diff --git a/.github/scripts/workflow_rerun/errors_to_look_for.json b/.github/scripts/workflow_rerun/errors_to_look_for.json index b9cac8f17adaa6..d8fe6ac2df03d2 100644 --- a/.github/scripts/workflow_rerun/errors_to_look_for.json +++ b/.github/scripts/workflow_rerun/errors_to_look_for.json @@ -86,5 +86,25 @@ { "error_text": "because the GET request got Content-Type", "ticket": 158400 + }, + { + "error_text": "Unable to make request:", + "ticket": 158401 + }, + { + "error_text": "Failed to make request", + "ticket": 158401 + }, + { + "error_text": "Failure when receiving data from the peer", + "ticket": 159323 + }, + { + "error_text": "HTTP response code said error", + "ticket": 159398 + }, + { + "error_text": "download failed after attempts", + "ticket": 159547 } ] \ No newline at end of file diff --git a/.github/workflows/cleanup_caches.yml b/.github/workflows/cleanup_caches.yml index d6633fd9dab3ee..c3aac30ccd4379 100644 --- a/.github/workflows/cleanup_caches.yml +++ b/.github/workflows/cleanup_caches.yml @@ -4,7 +4,7 @@ on: schedule: # at 00:00 on the 1st day of every month - cron: '0 0 1 * *' - + permissions: read-all jobs: @@ -61,8 +61,8 @@ jobs: cache-path: ${{ env.CCACHE_PATH }} recursive: true key: '.' - - + + Cleanup_ccache_win: name: Cleanup Windows ccache runs-on: 'aks-win-4-cores-8gb' diff --git a/.github/workflows/export_workflow_metrics.yml b/.github/workflows/export_workflow_metrics.yml index 39bb699b8caa91..aef00244f8175b 100644 --- a/.github/workflows/export_workflow_metrics.yml +++ b/.github/workflows/export_workflow_metrics.yml @@ -34,7 +34,7 @@ permissions: read-all jobs: export-workflow-metrics: name: Export finished workflow metrics - runs-on: aks-linux-2-cores-8gb + runs-on: aks-linux-2-cores-8gb-stats if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: diff --git a/.github/workflows/job_jax_layer_tests.yml b/.github/workflows/job_jax_layer_tests.yml new file mode 100644 index 00000000000000..25f171060f43be --- /dev/null +++ b/.github/workflows/job_jax_layer_tests.yml @@ -0,0 +1,133 @@ +name: JAX Layer Tests + +on: + workflow_call: + inputs: + runner: + description: 'Machine on which the tests would run' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + affected-components: + description: 'Components that are affected by changes in the commit defined by the Smart CI Action' + type: string + required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true + +permissions: read-all + +env: + PIP_CACHE_PATH_LINUX: /mount/caches/pip/linux + PIP_CACHE_PATH_WIN: "C:\\mount\\caches\\pip\\win" + +jobs: + JAX_Layer_Tests: + name: JAX Layer Tests + timeout-minutes: 40 + runs-on: ${{ inputs.runner }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels + LAYER_TESTS_INSTALL_DIR: ${{ github.workspace }}/install/tests/layer_tests + steps: + - name: Download OpenVINO artifacts (tarballs) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_[tests]* + path: ${{ env.INSTALL_DIR }} + merge-multiple: true + + - name: Download OpenVINO artifacts (wheels) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_[wheels]* + path: ${{ env.INSTALL_WHEELS_DIR }} + merge-multiple: true + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + if: runner.os != 'Windows' + run: | + echo "OPENVINO_REPO=$GITHUB_WORKSPACE/openvino" >> "$GITHUB_ENV" + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" + echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages (Linux, macOS) + if: runner.os != 'Windows' + run: | + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} + working-directory: ${{ env.INSTALL_DIR }} + + - name: Extract OpenVINO artifacts (Windows) + if: runner.os == 'Windows' + run: | + Expand-Archive openvino_tests.zip -DestinationPath ${{ env.INSTALL_DIR }} + working-directory: ${{ env.INSTALL_DIR }} + + - name: Fetch setup_python and install wheels actions + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 + with: + sparse-checkout: | + .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml + sparse-checkout-cone-mode: false + path: 'openvino' + + - name: Setup Python ${{ inputs.python-version }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ inputs.python-version }} + pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH_LINUX || env.PIP_CACHE_PATH_WIN }} + should-setup-pip-paths: ${{ runner.os != 'macOS' }} + self-hosted-runner: ${{ runner.os != 'macOS' }} + + - name: Install OpenVINO Python wheels + uses: ./openvino/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + + - name: Install JAX Layer tests dependencies + run: | + # jax test requirements + python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/requirements_jax + + - name: JAX Layer Tests + if: ${{ fromJSON(inputs.affected-components).JAX_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/jax_tests ${PARALLEL} -m 
precommit_jax_fe --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-jax.xml + env: + TEST_DEVICE: CPU + TEST_PRECISION: FP16 + JAX_TRACE_MODE: JAXPR + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} + + - name: Upload Test Results + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + if: ${{ !cancelled() }} + with: + name: test-results-python-jax-layers + path: | + ${{ env.INSTALL_TEST_DIR }}/TEST*.html + ${{ env.INSTALL_TEST_DIR }}/TEST*.xml + if-no-files-found: 'warn' diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index 07155db1016057..57eb07a83aa423 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -89,7 +89,7 @@ jobs: - name: Install JAX tests requirements for precommit run: | - python3 -m pip install -r ${MODEL_HUB_TESTS_INSTALL_DIR}/jax/requirements.txt + python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/requirements_jax - name: JAX/Flax Models Tests from Hugging Face if: ${{ inputs.model_scope == 'precommit' || inputs.model_scope == 'nightly' }} diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index b04f719c8e296f..e1532d530ff2db 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -162,14 +162,6 @@ jobs: export LD_LIBRARY_PATH=${PIP_INSTALL_PATH}/openvino/libs:$LD_LIBRARY_PATH python3 -m pytest ${LAYER_TESTS_INSTALL_DIR}/py_frontend_tests --junitxml=${INSTALL_TEST_DIR}/TEST-test_py_fontend.xml - - name: JAX Layer Tests - JAX FE - if: ${{ fromJSON(inputs.affected-components).JAX_FE.test && runner.arch != 'ARM64' && runner.os != 'macOS' }} - run: python3 -m pytest ${LAYER_TESTS_INSTALL_DIR}/jax_tests/ -m precommit_jax_fe --junitxml=${INSTALL_TEST_DIR}/TEST-jax_fe.xml - env: - TEST_DEVICE: CPU - TEST_PRECISION: FP16 - JAX_TRACE_MODE: JAXPR - - name: TensorFlow Lite Layer Tests - TFL FE if: fromJSON(inputs.affected-components).TFL_FE.test run: python3 -m pytest ${LAYER_TESTS_INSTALL_DIR}/tensorflow_lite_tests/ -n logical --junitxml=${INSTALL_TEST_DIR}/TEST-tfl_fe.xml diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 66e825e5d5e126..ca1ca6e056e23d 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -202,6 +202,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Docker, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-linux-16-cores-32gb-arm' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 5e4335b8151c02..0fbc20cf19594b 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -356,6 +356,15 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'macos-13' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + 
python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests # if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 855d76973cc2e4..b60daefa442c83 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -355,6 +355,15 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'macos-13-xlarge' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 5aed74bbb242b8..e5c7d25003de1e 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -334,6 +334,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 25be095e692d35..beac15bfbda97d 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -156,6 +156,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index f1fd0be596baa2..de33f2603d7430 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -499,6 +499,15 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-win-8-cores-16gb' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CXX_Unit_Tests: name: C++ unit tests needs: [ Build, Smart_CI ] diff --git a/.github/workflows/workflow_rerunner.yml b/.github/workflows/workflow_rerunner.yml index 0d8d6610bea588..535101ec943264 100644 --- a/.github/workflows/workflow_rerunner.yml +++ b/.github/workflows/workflow_rerunner.yml @@ -29,7 +29,7 @@ jobs: name: Rerun Workflow # Run only for the failed workflows in openvinotoolkit org if: ${{ 
github.event.workflow_run.conclusion == 'failure' && github.repository_owner == 'openvinotoolkit' }} - runs-on: aks-linux-2-cores-8gb + runs-on: aks-linux-2-cores-8gb-stats permissions: actions: write contents: read @@ -70,7 +70,7 @@ jobs: rerunner_tests: name: Rerunner Tests if: ${{ github.event_name == 'pull_request' && github.repository_owner == 'openvinotoolkit' }} - runs-on: aks-linux-2-cores-8gb + runs-on: aks-linux-2-cores-8gb-stats steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -96,9 +96,9 @@ jobs: working-directory: ${{ github.workspace }}/.github/scripts/workflow_rerun run: | export PYTHONPATH=${{ github.workspace }}/.github/scripts/workflow_rerun:${{ github.workspace }}/.github/scripts:$PYTHONPATH - + # Need to get a run id with successful status for log analyzing # cannot lock a run id as logs get deleted after some time run_id=$(python3 -c "from github import Github, Auth; import os; github=Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN'))); repo = github.get_repo('${GITHUB_REPOSITORY}'); run_id = repo.get_workflow_runs(status='success')[0].id; print(run_id)") - + python3 rerunner.py --repository-name ${GITHUB_REPOSITORY} --run-id $run_id --dry-run diff --git a/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst b/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst index d877cb1768d44d..f4ec275491fa32 100644 --- a/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst +++ b/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst @@ -6,16 +6,14 @@ models from OpenVINO-supported frameworks may also work properly but have not be **AI Models that run on Intel® Core Ultra™ Processors with OpenVINO™ toolkit:** -.. raw:: html - - - - -.. csv-table:: +.. data-table:: :class: modeldata stripe :name: supportedModelsTable :header-rows: 1 :file: ../../_static/download/supported_models.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 | Marked cells indicate models that passed inference with no errors. Empty cells indicate diff --git a/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst b/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst index d27f7626391f46..1bd8f5dae7c634 100644 --- a/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst +++ b/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst @@ -41,27 +41,36 @@ Data as of OpenVINO 2024.4, 18 Oct. 2024. .. tab-item:: PyTorch - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: TensorFlow ops + :name: TensorFlow_ops_v1 :header-rows: 1 :file: ../../_static/conformance_files/pytorch_ops.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 .. tab-item:: TensorFlow - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: TensorFlow ops + :name: TensorFlow_ops_v2 :header-rows: 1 :file: ../../_static/conformance_files/tensorflow_ops.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 .. tab-item:: PaddlePaddle - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: Paddle ops + :name: Paddle_ops :header-rows: 1 :file: ../../_static/conformance_files/paddlepaddle_ops.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 .. 
tab-item:: ONNX
diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst
index 085a1ff8449151..83581d465df92e 100644
--- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst
+++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst
@@ -8,10 +8,6 @@ The current data is as of OpenVINO 2024.4, 20 Nov. 2024.
 
 The tables below list the key performance indicators for inference on built-in GPUs.
 
-.. raw:: html
-
-
-
 
 .. tab-set::
 
@@ -22,7 +18,9 @@ The tables below list the key performance indicators for inference on built-in G
          :name: supportedModelsTable_V1
          :header-rows: 1
          :file: ../../_static/benchmarks_files/llm_models_9-288V.csv
-         :hidden: [3,4,6]
+         :data-column-hidden: [3,4,6]
+         :data-order: [[ 0, "asc" ]]
+         :data-page-length: 10
 
    .. tab-item:: 7-268V
 
      .. csv-table::
         :class: modeldata stripe
         :name: supportedModelsTable_V2
         :header-rows: 1
         :file: ../../_static/benchmarks_files/llm_models_7-258V.csv
-        :hidden: [3,4,6]
+        :data-column-hidden: [3,4,6]
+        :data-order: [[ 0, "asc" ]]
 
   .. tab-item:: 7-155H
 
     .. csv-table::
        :class: modeldata stripe
        :name: supportedModelsTable_V3
       :header-rows: 1
        :file: ../../_static/benchmarks_files/llm_models_7-155H.csv
-       :hidden: [3,4,6]
+       :data-column-hidden: [3,4,6]
+       :data-order: [[ 0, "asc" ]]
 
 
 .. grid:: 1 1 2 2
diff --git a/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst b/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst
index 6ac806daf0cda0..62cfdf05f2b11f 100644
--- a/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst
+++ b/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst
@@ -203,6 +203,52 @@ Here is an example of how to convert a model obtained with ``torch.export``:
    This is an experimental feature. Use it only if you know that you need to.
    PyTorch version 2.2 is recommended.
    Dynamic shapes are not supported yet.
 
+Converting a PyTorch Model from Disk
+####################################
+
+PyTorch can save models in two distinct formats: ``torch.jit.ScriptModule`` and ``torch.export.ExportedProgram``.
+Both formats can be saved to disk as standalone files, enabling them to be reloaded independently of the original Python code.
+
+ExportedProgram Format
+++++++++++++++++++++++
+
+The ``ExportedProgram`` format is saved to disk using `torch.export.save() <https://pytorch.org/docs/stable/export.html#torch.export.save>`__.
+Below is an example of how to convert an ``ExportedProgram`` from disk:
+
+.. tab-set::
+
+   .. tab-item:: Python
+      :sync: py
+
+      .. code-block:: py
+         :force:
+
+         import openvino as ov
+         ov_model = ov.convert_model('exported_program.pt2')
+
+   .. tab-item:: CLI
+      :sync: cli
+
+      .. code-block:: sh
+
+         ovc exported_program.pt2
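+
+For reference, the ``exported_program.pt2`` file used above (and the ``script_module.pt``
+file used in the next section) could be produced as in the following sketch. The module
+and input shape are illustrative placeholders, and a recent PyTorch release with
+``torch.export`` support is assumed:
+
+.. code-block:: py
+   :force:
+
+   import torch
+
+   class Sample(torch.nn.Module):
+       def forward(self, x):
+           return x * 2
+
+   # ExportedProgram: capture the module with torch.export, then serialize it.
+   exported_program = torch.export.export(Sample(), (torch.rand(1, 10),))
+   torch.export.save(exported_program, 'exported_program.pt2')
+
+   # ScriptModule: script the module, then serialize it with torch.jit.save.
+   torch.jit.save(torch.jit.script(Sample()), 'script_module.pt')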
+
+ScriptModule Format
++++++++++++++++++++
+
+`torch.jit.save() <https://pytorch.org/docs/stable/generated/torch.jit.save.html>`__ serializes a ``ScriptModule`` object to disk.
+To convert a serialized ``ScriptModule``, run the ``convert_model`` function with the ``example_input`` parameter, as follows:
+
+.. code-block:: py
+   :force:
+
+   from openvino import convert_model
+   import torch
+
+   convert_model(input_model='script_module.pt', example_input=torch.rand(1, 10))
+
+``example_input`` is a required parameter for this conversion, because a ``torch.jit.ScriptModule`` object is always saved to disk in an untraced state.
+
 Exporting a PyTorch Model to ONNX Format
 ########################################
diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py b/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py
index c3e0e81eec3b3a..814517289ce114 100644
--- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py
+++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py
@@ -11,7 +11,7 @@
 import requests
 import re
 import json
-
+import html
 import csv
 
 logger = logging.getLogger(__name__)
@@ -147,7 +147,9 @@ class DataTable(Directive):
         'file': directives.path,
         'class': directives.unchanged,
         'name': directives.unchanged,
-        'hidden': directives.unchanged
+        'data-column-hidden': directives.unchanged,
+        'data-page-length': directives.unchanged,
+        'data-order': directives.unchanged
     }
 
     def run(self) -> List[Node]:
@@ -159,10 +161,12 @@ def run(self) -> List[Node]:
         csv_node = []
         with open(csv_file, 'r') as j:
             csv_data = list(csv.reader(j))
-            class_table_tag = ' class="' + "".join(c for c in str(self.options['class']) + '"') if 'class' in self.options is not None else ""
-            id_table_tag = ' id="' + "".join(c for c in str(self.options['name']) + '"') if 'name' in self.options is not None else ""
-            hidden_table_tag = ' data-columns-hidden="' + "".join(c for c in str(self.options['hidden']) + '"') if 'hidden' in self.options is not None else ""
-            csv_table_html = '<table' + class_table_tag + id_table_tag + hidden_table_tag + '>'
+            class_table_tag = f' class="{html.escape(self.options["class"])}"' if "class" in self.options else ""
+            id_table_tag = f' id="{html.escape(self.options["name"])}"' if "name" in self.options else ""
+            data_column_hidden_tag = f' data-column-hidden="{html.escape(self.options["data-column-hidden"])}"' if "data-column-hidden" in self.options else ""
+            data_order_tag = f' data-order="{html.escape(self.options["data-order"])}"' if "data-order" in self.options else ""
+            data_page_length_tag = f' data-page-length="{html.escape(self.options["data-page-length"])}"' if "data-page-length" in self.options else ""
+            csv_table_html = f'<table{class_table_tag}{id_table_tag}{data_column_hidden_tag}{data_order_tag}{data_page_length_tag}>'
         head_rows = 0
         head_rows += self.options.get('header-rows', 0)
         row_count = 0
diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css
index de8a05732a4d06..1679f7309da044 100644
--- a/docs/sphinx_setup/_static/css/custom.css
+++ b/docs/sphinx_setup/_static/css/custom.css
@@ -69,7 +69,7 @@ a#wap_dns {
 /* Sphinx-design tabs override */
 .sd-tab-set>input:checked+label {
   color: var(--sd-color-black) !important;
-  background-color: #f8f8f8 !important;
+  background-color: white !important;
   border: solid 1px #bdbdbd;
   border-bottom: solid 0px;
   margin-bottom: -1px;
@@ -96,7 +96,7 @@ a#wap_dns {
   cursor: pointer;
   font-size: var(--sd-fontsize-tabs-label);
   font-weight: 400 !important;
-  padding: 5px 16px 2px !important;
+  padding: 5px 16px 0px !important;
   transition: color 250ms;
   width: auto;
   z-index: 1;
@@ -110,7 +110,6 @@ a#wap_dns {
   box-shadow: 0 0 0 0;
   border: solid 1px var(--sd-color-tabs-overline);
   border-color: #bdbdbd;
-  background-color: #f8f8f8;
   padding-right: 4px;
   padding-left: 4px;
   padding-bottom: 6px;
diff --git a/docs/sphinx_setup/_static/css/openVinoDataTables.css b/docs/sphinx_setup/_static/css/openVinoDataTables.css
index 526aabb6abe15d..bedc0f5206e260 100644 --- a/docs/sphinx_setup/_static/css/openVinoDataTables.css +++ b/docs/sphinx_setup/_static/css/openVinoDataTables.css @@ -6,8 +6,7 @@ div.dt-buttons>.dt-button, div.dt-buttons>div.dt-button-split .dt-button { } div.dt-container .dt-paging .dt-paging-button:hover { - color: white !important; - border: 1px solid #aaa; + border: 1px solid #aaa !important; background:none !important; background-color: var(--bttn-act-bg-hover) !important } @@ -190,10 +189,9 @@ div.dt-container .dt-paging .dt-paging-button { div.dt-container .dt-paging .dt-paging-button.current, div.dt-container .dt-paging .dt-paging-button.current:hover { background: none !important; - background-color: var(--bttn-act-bg-active) !important; + background-color: var(--bttn-sec-border-color) !important; border-color: var(--bttn-act-bg-active) !important; border-radius: 0px !important; - color: white !important; border: 1px !important } table.dataTable thead>tr>th.dt-orderable-asc span.dt-column-order:before, table.dataTable thead>tr>th.dt-orderable-asc span.dt-column-order:after, table.dataTable thead>tr>th.dt-orderable-desc span.dt-column-order:before, table.dataTable thead>tr>th.dt-orderable-desc span.dt-column-order:after, table.dataTable thead>tr>th.dt-ordering-asc span.dt-column-order:before, table.dataTable thead>tr>th.dt-ordering-asc span.dt-column-order:after, table.dataTable thead>tr>th.dt-ordering-desc span.dt-column-order:before, table.dataTable thead>tr>th.dt-ordering-desc span.dt-column-order:after, table.dataTable thead>tr>td.dt-orderable-asc span.dt-column-order:before, table.dataTable thead>tr>td.dt-orderable-asc span.dt-column-order:after, table.dataTable thead>tr>td.dt-orderable-desc span.dt-column-order:before, table.dataTable thead>tr>td.dt-orderable-desc span.dt-column-order:after, table.dataTable thead>tr>td.dt-ordering-asc span.dt-column-order:before, table.dataTable thead>tr>td.dt-ordering-asc span.dt-column-order:after, table.dataTable thead>tr>td.dt-ordering-desc span.dt-column-order:before, table.dataTable thead>tr>td.dt-ordering-desc span.dt-column-order:after { diff --git a/docs/sphinx_setup/_static/js/openVinoDataTables.js b/docs/sphinx_setup/_static/js/openVinoDataTables.js index bd56a71533786c..fb3a57d959020c 100644 --- a/docs/sphinx_setup/_static/js/openVinoDataTables.js +++ b/docs/sphinx_setup/_static/js/openVinoDataTables.js @@ -1,16 +1,15 @@ $(document).ready(function () { var columnDefs = []; - var tables = $('table.modeldata'); for (let table of tables) { - var hidden = table.getAttribute('data-columns-hidden'); + var hidden = table.getAttribute('data-column-hidden'); columnDefs = [{ "visible": false, "targets": JSON.parse(hidden) }] $(table).DataTable({ responsive: true, "autoWidth": false, language: { buttons: { - colvisRestore: "Restore default" + colvisRestore: "Restore default selection" } }, lengthMenu: [ diff --git a/docs/sphinx_setup/_templates/layout.html b/docs/sphinx_setup/_templates/layout.html index 0d2331b2c83fe3..a791091e1f13a4 100644 --- a/docs/sphinx_setup/_templates/layout.html +++ b/docs/sphinx_setup/_templates/layout.html @@ -9,6 +9,7 @@ + diff --git a/src/bindings/python/src/openvino/__init__.py b/src/bindings/python/src/openvino/__init__.py index 7643f742e0067d..69c678909b1c9e 100644 --- a/src/bindings/python/src/openvino/__init__.py +++ b/src/bindings/python/src/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import 
_add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass @@ -17,47 +17,6 @@ # # This __init__.py forces checking of runtime modules to propagate errors. # # It is not compared with init files from openvino-dev package. # # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue - # Import all public modules from openvino import runtime as runtime from openvino import frontend as frontend @@ -67,10 +26,36 @@ from openvino import utils as utils from openvino import properties as properties +# Import most important classes and functions from openvino.runtime +from openvino._ov_api import Model +from openvino._ov_api import Core +from openvino._ov_api import CompiledModel +from openvino._ov_api import InferRequest +from openvino._ov_api import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + # Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file +from openvino.runtime.utils.data_helpers import tensor_from_file from openvino._ov_api import compile_model +from openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import save_model +from openvino.runtime import layout_helpers +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor +from openvino._pyopenvino import Op # Import opsets from openvino import opset1 @@ -95,7 +80,7 @@ from openvino._pyopenvino import VASurfaceTensor # Set version for openvino package -from openvino._pyopenvino import get_version 
+from openvino.runtime import get_version __version__ = get_version() # Tools diff --git a/src/bindings/python/src/openvino/_ov_api.py b/src/bindings/python/src/openvino/_ov_api.py index da31fab4c95d8e..53d0fa5316498b 100644 --- a/src/bindings/python/src/openvino/_ov_api.py +++ b/src/bindings/python/src/openvino/_ov_api.py @@ -5,7 +5,9 @@ from types import TracebackType from typing import Any, Iterable, Union, Optional, Dict, Type from pathlib import Path +import warnings +import numpy as np from openvino._pyopenvino import Model as ModelBase from openvino._pyopenvino import Core as CoreBase @@ -14,7 +16,7 @@ from openvino._pyopenvino import Tensor from openvino._pyopenvino import Node -from openvino.utils.data_helpers import ( +from openvino.runtime.utils.data_helpers import ( OVDict, _InferRequestWrapper, _data_dispatch, diff --git a/src/bindings/python/src/openvino/frontend/frontend.py b/src/bindings/python/src/openvino/frontend/frontend.py index 6a16d5a573b7d7..4d549d24b4ef7c 100644 --- a/src/bindings/python/src/openvino/frontend/frontend.py +++ b/src/bindings/python/src/openvino/frontend/frontend.py @@ -7,7 +7,7 @@ from openvino._pyopenvino import FrontEnd as FrontEndBase from openvino._pyopenvino import FrontEndManager as FrontEndManagerBase from openvino._pyopenvino import InputModel -from openvino import Model +from openvino.runtime import Model class FrontEnd(FrontEndBase): diff --git a/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py b/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py index 9072598f824939..914f6b2e2ee548 100644 --- a/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py +++ b/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py @@ -6,7 +6,7 @@ import jax.core from openvino.frontend.jax.py_jax_frontend import _FrontEndJaxDecoder as Decoder -from openvino import PartialShape, Type as OVType, OVAny +from openvino.runtime import PartialShape, Type as OVType, OVAny from openvino.frontend.jax.utils import jax_array_to_ov_const, get_ov_type_for_value, \ ivalue_to_constant, param_to_constants diff --git a/src/bindings/python/src/openvino/frontend/jax/utils.py b/src/bindings/python/src/openvino/frontend/jax/utils.py index 659677b11d5af8..4535265d6de082 100644 --- a/src/bindings/python/src/openvino/frontend/jax/utils.py +++ b/src/bindings/python/src/openvino/frontend/jax/utils.py @@ -8,7 +8,7 @@ import jax.numpy as jnp import numpy as np from openvino.frontend.jax.passes import filter_element, filter_ivalue, filter_param -from openvino import op, Type as OVType, Shape, OVAny +from openvino.runtime import op, Type as OVType, Shape, OVAny numpy_to_ov_type_map = { np.float32: OVType.f32, diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index 81a2764ee1188d..c448571f1ac17a 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -10,7 +10,7 @@ from openvino.frontend.pytorch.py_pytorch_frontend import _FrontEndPytorchDecoder as Decoder from openvino.frontend.pytorch.py_pytorch_frontend import _Type as DecoderType -from openvino import PartialShape, Type as OVType, OVAny, Shape +from openvino.runtime import PartialShape, Type as OVType, OVAny, Shape from openvino.frontend.pytorch.utils import make_constant, fetch_attr, pt_to_ov_type_map, torch_tensor_to_ov_const logger = logging.getLogger(__name__) diff --git 
a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index a9a65781dcb254..9f2ef019769875 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -18,7 +18,7 @@ from torch._decomp import decomposition_table, get_decompositions from openvino.frontend import FrontEndManager -from openvino import Core, Type, PartialShape +from openvino.runtime import Core, Type, PartialShape from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder from openvino.frontend.pytorch.torchdynamo import decompositions from openvino.frontend.pytorch.torchdynamo.decompositions import get_aot_decomposition_list, get_inf_decomposition_list @@ -27,7 +27,7 @@ from openvino.frontend.pytorch.torchdynamo.compile import cached_model_name, openvino_compile_cached_model from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_model_caching, _get_decompositions, _get_aot_autograd -from openvino import Core, Type, PartialShape +from openvino.runtime import Core, Type, PartialShape logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py index c9a772b3feac42..47b3b82806b18b 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py @@ -5,7 +5,7 @@ # mypy: ignore-errors from typing import Optional, Any -from openvino import Core +from openvino.runtime import Core def _get_device(options) -> Optional[Any]: diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py index ca8d5478e76c15..fa446893a05d07 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py @@ -14,7 +14,7 @@ from openvino.frontend import FrontEndManager from openvino.frontend.pytorch.fx_decoder import TorchFXPythonDecoder -from openvino import Core, Type, PartialShape, serialize +from openvino.runtime import Core, Type, PartialShape, serialize from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_config, _is_cache_dir_in_config from typing import Callable, Optional diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py index 7527ad7acb37a4..4f41f7b5a6a9de 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py @@ -20,7 +20,7 @@ from openvino.frontend.pytorch.fx_decoder import TorchFXPythonDecoder from openvino.frontend.pytorch.torchdynamo.partition import Partitioner from openvino.frontend.pytorch.torchdynamo.compile import openvino_compile -from openvino import Core, Type, PartialShape +from openvino.runtime import Core, Type, PartialShape from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_aot_autograd from typing import Callable, Optional, Any diff --git 
a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py index 7bb8073167a654..6d8fdb1658793e 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py @@ -6,7 +6,7 @@ from openvino.frontend.pytorch.py_pytorch_frontend import _FrontEndPytorchDecoder as Decoder from openvino.frontend.pytorch.py_pytorch_frontend import _Type as DecoderType -from openvino import op, PartialShape, Type as OVType, OVAny +from openvino.runtime import op, PartialShape, Type as OVType, OVAny from openvino.frontend.pytorch.utils import ( ivalue_to_constant, get_value_from_getattr, @@ -15,7 +15,7 @@ convert_quantized_tensor, graph_has_ops, ) -from openvino import opset11 as ops +from openvino.runtime import opset11 as ops from openvino.frontend.pytorch import quantized, patch_model from openvino.frontend.pytorch.module_extension import ModuleExtension diff --git a/src/bindings/python/src/openvino/frontend/pytorch/utils.py b/src/bindings/python/src/openvino/frontend/pytorch/utils.py index 9ba36707037c9e..826d766505fa79 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/utils.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/utils.py @@ -7,8 +7,8 @@ import torch import numpy as np -from openvino import op, Type as OVType, Shape, Tensor -from openvino import opset11 as ops +from openvino.runtime import op, Type as OVType, Shape, Tensor +from openvino.runtime import opset11 as ops def make_constant(*args, **kwargs): diff --git a/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py b/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py index d15262cbc30366..fcedd7a74c2b51 100644 --- a/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py +++ b/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py @@ -7,7 +7,7 @@ import numpy as np import tensorflow as tf from openvino.frontend.tensorflow.py_tensorflow_frontend import _FrontEndDecoderBase as DecoderBase -from openvino import PartialShape, Type, OVAny, Tensor +from openvino.runtime import PartialShape, Type, OVAny, Tensor def tf_type_to_ov_type(tf_type_int): diff --git a/src/bindings/python/src/openvino/frontend/tensorflow/utils.py b/src/bindings/python/src/openvino/frontend/tensorflow/utils.py index 7de5dc950be53e..74c0dfff92297e 100644 --- a/src/bindings/python/src/openvino/frontend/tensorflow/utils.py +++ b/src/bindings/python/src/openvino/frontend/tensorflow/utils.py @@ -8,7 +8,7 @@ import logging as log import numpy as np import sys -from openvino import PartialShape, Dimension, Type +from openvino.runtime import PartialShape, Dimension, Type from packaging.version import parse, Version from typing import List, Dict, Union diff --git a/src/bindings/python/src/openvino/helpers/packing.py b/src/bindings/python/src/openvino/helpers/packing.py index d0956e09fc6261..796af87402f3a6 100644 --- a/src/bindings/python/src/openvino/helpers/packing.py +++ b/src/bindings/python/src/openvino/helpers/packing.py @@ -5,7 +5,7 @@ import numpy as np from typing import Union -from openvino import Type, Shape +from openvino.runtime import Type, Shape def pack_data(array: np.ndarray, type: Type) -> np.ndarray: diff --git a/src/bindings/python/src/openvino/opset1/ops.py b/src/bindings/python/src/openvino/opset1/ops.py index e264aea304fb1f..edca6c62a0b246 100644 --- a/src/bindings/python/src/openvino/opset1/ops.py +++ 
b/src/bindings/python/src/openvino/opset1/ops.py @@ -8,17 +8,17 @@ import numpy as np from functools import partial -from openvino import Node, PartialShape, Type +from openvino.runtime import Node, PartialShape, Type from openvino.op import Constant, Parameter, tensor_iterator -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset10/ops.py b/src/bindings/python/src/openvino/opset10/ops.py index d0bc3cbf1cba4a..c7b75777484a59 100644 --- a/src/bindings/python/src/openvino/opset10/ops.py +++ b/src/bindings/python/src/openvino/opset10/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import List, Optional -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, as_node, diff --git a/src/bindings/python/src/openvino/opset11/ops.py b/src/bindings/python/src/openvino/opset11/ops.py index 95767b4800db1c..575c99501d2d6c 100644 --- a/src/bindings/python/src/openvino/opset11/ops.py +++ b/src/bindings/python/src/openvino/opset11/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import List, Optional -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, ) diff --git a/src/bindings/python/src/openvino/opset12/ops.py b/src/bindings/python/src/openvino/opset12/ops.py index 4b354b1fcff973..928bf4f71a9773 100644 --- a/src/bindings/python/src/openvino/opset12/ops.py +++ b/src/bindings/python/src/openvino/opset12/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import Optional -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, as_node, diff --git a/src/bindings/python/src/openvino/opset13/ops.py b/src/bindings/python/src/openvino/opset13/ops.py index 5c6863740120f8..12f0d06b1a28e6 100644 --- a/src/bindings/python/src/openvino/opset13/ops.py +++ b/src/bindings/python/src/openvino/opset13/ops.py @@ -11,12 +11,12 @@ log = logging.getLogger(__name__) -from openvino import Node, Shape, Type, Output, Tensor +from 
openvino.runtime import Node, Shape, Type, Output, Tensor from openvino.op import Constant, Result from openvino.opset1 import convert_like -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import binary_op, nameable_op, unary_op, overloading -from openvino.utils.types import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op, overloading +from openvino.runtime.utils.types import ( NumericData, NodeInput, NumericType, diff --git a/src/bindings/python/src/openvino/opset14/ops.py b/src/bindings/python/src/openvino/opset14/ops.py index 59e1bfd3e89c6f..fa872d24eb7f1a 100644 --- a/src/bindings/python/src/openvino/opset14/ops.py +++ b/src/bindings/python/src/openvino/opset14/ops.py @@ -7,11 +7,11 @@ from typing import Union, Optional, List -from openvino import Node, Type -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.types import TensorShape -from openvino.utils.decorators import nameable_op -from openvino.utils.types import NodeInput, as_node, as_nodes +from openvino.runtime import Node, Type +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.types import TensorShape +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import NodeInput, as_node, as_nodes _get_node_factory_opset14 = partial(_get_node_factory, "opset14") diff --git a/src/bindings/python/src/openvino/opset15/ops.py b/src/bindings/python/src/openvino/opset15/ops.py index 97d4419fc4834b..8e6b8bd46d5f7c 100644 --- a/src/bindings/python/src/openvino/opset15/ops.py +++ b/src/bindings/python/src/openvino/opset15/ops.py @@ -7,12 +7,12 @@ from typing import List, Literal, Optional import numpy as np -from openvino import Node, Type +from openvino.runtime import Node, Type from openvino.opset1 import convert_like from openvino.opset14 import constant -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import binary_op, nameable_op -from openvino.utils.types import NodeInput, as_nodes +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op +from openvino.runtime.utils.types import NodeInput, as_nodes _get_node_factory_opset15 = partial(_get_node_factory, "opset15") diff --git a/src/bindings/python/src/openvino/opset16/ops.py b/src/bindings/python/src/openvino/opset16/ops.py index e5ebdc7a2a11d6..60656f6d993b6a 100644 --- a/src/bindings/python/src/openvino/opset16/ops.py +++ b/src/bindings/python/src/openvino/opset16/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import Optional -from openvino import Node -from openvino.utils.decorators import nameable_op -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.types import NodeInput, as_nodes +from openvino.runtime import Node +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.types import NodeInput, as_nodes _get_node_factory_opset16 = partial(_get_node_factory, "opset16") diff --git a/src/bindings/python/src/openvino/opset2/ops.py b/src/bindings/python/src/openvino/opset2/ops.py index f76f608fe9a5c7..45b33f5bc0288b 100644 --- a/src/bindings/python/src/openvino/opset2/ops.py +++ b/src/bindings/python/src/openvino/opset2/ops.py @@ -9,17 +9,18 @@ from functools import partial import warnings -from 
openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset3/ops.py b/src/bindings/python/src/openvino/opset3/ops.py index 1c2c7e309fe919..989f5819acb685 100644 --- a/src/bindings/python/src/openvino/opset3/ops.py +++ b/src/bindings/python/src/openvino/opset3/ops.py @@ -8,17 +8,18 @@ import numpy as np from functools import partial -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset4/ops.py b/src/bindings/python/src/openvino/opset4/ops.py index e6f3a3a1550937..4f6ba016852b02 100644 --- a/src/bindings/python/src/openvino/opset4/ops.py +++ b/src/bindings/python/src/openvino/opset4/ops.py @@ -8,17 +8,18 @@ import numpy as np from functools import partial -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset5/ops.py b/src/bindings/python/src/openvino/opset5/ops.py index 9217830752b1d8..20057b78c7c31d 100644 --- a/src/bindings/python/src/openvino/opset5/ops.py +++ b/src/bindings/python/src/openvino/opset5/ops.py @@ -8,17 +8,18 @@ import numpy as np from functools import partial -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter, loop -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from 
openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset6/ops.py b/src/bindings/python/src/openvino/opset6/ops.py index 340d0405b4ba23..8020715f20dea3 100644 --- a/src/bindings/python/src/openvino/opset6/ops.py +++ b/src/bindings/python/src/openvino/opset6/ops.py @@ -9,13 +9,13 @@ from functools import partial, singledispatch -from openvino import Node, Type, PartialShape, Output, Shape +from openvino.runtime import Node, Type, PartialShape, Output, Shape from openvino.op import assign, Constant, Parameter from openvino.op import read_value as _read_value from openvino.op.util import VariableInfo, Variable -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op, overloading -from openvino.utils.types import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op, overloading +from openvino.runtime.utils.types import ( NodeInput, NumericType, TensorShape, diff --git a/src/bindings/python/src/openvino/opset7/ops.py b/src/bindings/python/src/openvino/opset7/ops.py index e33d266debedf1..59e09b64888eb1 100644 --- a/src/bindings/python/src/openvino/opset7/ops.py +++ b/src/bindings/python/src/openvino/opset7/ops.py @@ -7,17 +7,18 @@ from typing import Callable, Iterable, List, Optional, Set, Union import numpy as np -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset8/ops.py b/src/bindings/python/src/openvino/opset8/ops.py index a9a868e7b541d8..6995d55a28a776 100644 --- a/src/bindings/python/src/openvino/opset8/ops.py +++ b/src/bindings/python/src/openvino/opset8/ops.py @@ -9,15 +9,15 @@ import numpy as np from openvino.exceptions import UserInputError from openvino.op import Constant, Parameter, if_op -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.input_validation import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.input_validation import ( check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.types import ( +from 
openvino.runtime.utils.types import ( NodeInput, TensorShape, as_node, diff --git a/src/bindings/python/src/openvino/opset9/ops.py b/src/bindings/python/src/openvino/opset9/ops.py index e2264845e058dc..a6d45cfd0be2cc 100644 --- a/src/bindings/python/src/openvino/opset9/ops.py +++ b/src/bindings/python/src/openvino/opset9/ops.py @@ -7,10 +7,10 @@ from typing import Optional import numpy as np -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, as_node, diff --git a/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py b/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py index 717e945217468c..c14635cc118208 100644 --- a/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py +++ b/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py @@ -5,7 +5,7 @@ from typing import Callable, Any, Union import logging -import openvino as ov +import openvino.runtime as ov class PreprocessConverter(): diff --git a/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py b/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py index 5dad42b47da44a..f8b51afd546f57 100644 --- a/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py +++ b/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py @@ -20,10 +20,10 @@ import torchvision.transforms as transforms from torchvision.transforms import InterpolationMode -import openvino as ov -import openvino.opset11 as ops -from openvino import Layout, Type -from openvino.utils.decorators import custom_preprocess_function +import openvino.runtime as ov +import openvino.runtime.opset11 as ops +from openvino.runtime import Layout, Type +from openvino.runtime.utils.decorators import custom_preprocess_function from openvino.preprocess import PrePostProcessor, ResizeAlgorithm, ColorFormat diff --git a/src/bindings/python/src/openvino/runtime/opset_utils.py b/src/bindings/python/src/openvino/runtime/opset_utils.py new file mode 100644 index 00000000000000..475750e71f87c5 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset_utils.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional +import numpy as np + +from openvino.runtime import Node +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( + as_node, + NodeInput, +) + + +def _get_node_factory(opset_version: Optional[str] = None) -> NodeFactory: + """Return NodeFactory configured to create operators from specified opset version.""" + if opset_version: + return NodeFactory(opset_version) + else: + return NodeFactory() diff --git a/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py b/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py deleted file mode 100644 index 6fb3e5f6f0c950..00000000000000 --- a/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- -# 
Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - - -from openvino.utils.node_factory import _get_node_factory diff --git a/src/bindings/python/src/openvino/runtime/utils/__init__.py b/src/bindings/python/src/openvino/runtime/utils/__init__.py index 8447e93a907277..73399ccbed2598 100644 --- a/src/bindings/python/src/openvino/runtime/utils/__init__.py +++ b/src/bindings/python/src/openvino/runtime/utils/__init__.py @@ -4,4 +4,4 @@ """Generic utilities. Factor related functions out to separate files.""" -from openvino.utils import numpy_to_c, replace_node, replace_output_update_name +from openvino._pyopenvino.util import numpy_to_c, replace_node, replace_output_update_name diff --git a/src/bindings/python/src/openvino/utils/broadcasting.py b/src/bindings/python/src/openvino/runtime/utils/broadcasting.py similarity index 87% rename from src/bindings/python/src/openvino/utils/broadcasting.py rename to src/bindings/python/src/openvino/runtime/utils/broadcasting.py index 01549625e2c628..9fd13da7728e29 100644 --- a/src/bindings/python/src/openvino/utils/broadcasting.py +++ b/src/bindings/python/src/openvino/runtime/utils/broadcasting.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Optional +from typing import List, Optional -from openvino import AxisSet -from openvino.utils.types import ( +from openvino.runtime import AxisSet, Node +from openvino.runtime.utils.types import ( + NodeInput, TensorShape, + get_dtype, + make_constant_node, ) log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py b/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py deleted file mode 100644 index 3219f239f0ab44..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.broadcasting import get_broadcast_axes diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py index 282547dd9df79a..a46105efaaeadb 100644 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.utils.data_helpers.data_dispatcher import _data_dispatch -from openvino.utils.data_helpers.wrappers import tensor_from_file -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.utils.data_helpers.wrappers import OVDict +from openvino.runtime.utils.data_helpers.data_dispatcher import _data_dispatch +from openvino.runtime.utils.data_helpers.wrappers import tensor_from_file +from openvino.runtime.utils.data_helpers.wrappers import _InferRequestWrapper +from openvino.runtime.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py similarity index 99% rename from src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py rename to src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py index d4db7cb07b629c..bce10c9c3774ef 100644 --- 
a/src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py @@ -8,7 +8,7 @@ import numpy as np from openvino._pyopenvino import ConstOutput, Tensor, Type, RemoteTensor -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper, OVDict +from openvino.runtime.utils.data_helpers.wrappers import _InferRequestWrapper, OVDict ContainerTypes = Union[dict, list, tuple, OVDict] ScalarTypes = Union[np.number, int, float] diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py deleted file mode 100644 index e0a2d022660dd3..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - - -from openvino.utils.data_helpers.data_dispatcher import ContainerTypes -from openvino.utils.data_helpers.data_dispatcher import ScalarTypes -from openvino.utils.data_helpers.data_dispatcher import ValidKeys - -from openvino.utils.data_helpers.data_dispatcher import is_list_simple_type -from openvino.utils.data_helpers.data_dispatcher import get_request_tensor -from openvino.utils.data_helpers.data_dispatcher import value_to_tensor -from openvino.utils.data_helpers.data_dispatcher import to_c_style -from openvino.utils.data_helpers.data_dispatcher import normalize_arrays -from openvino.utils.data_helpers.data_dispatcher import create_shared -from openvino.utils.data_helpers.data_dispatcher import set_request_tensor -from openvino.utils.data_helpers.data_dispatcher import update_tensor -from openvino.utils.data_helpers.data_dispatcher import update_inputs -from openvino.utils.data_helpers.data_dispatcher import create_copied -from openvino.utils.data_helpers.data_dispatcher import _data_dispatch diff --git a/src/bindings/python/src/openvino/utils/data_helpers/wrappers.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers.py similarity index 100% rename from src/bindings/python/src/openvino/utils/data_helpers/wrappers.py rename to src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers.py diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py deleted file mode 100644 index 22214fd24682da..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - - -from openvino.utils.data_helpers.wrappers import tensor_from_file -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/utils/decorators.py b/src/bindings/python/src/openvino/runtime/utils/decorators.py similarity index 98% rename from src/bindings/python/src/openvino/utils/decorators.py rename to src/bindings/python/src/openvino/runtime/utils/decorators.py index 9418c359d129e8..98da1ba4389ef7 100644 --- a/src/bindings/python/src/openvino/utils/decorators.py +++ b/src/bindings/python/src/openvino/runtime/utils/decorators.py @@ -6,8 +6,8 @@ from inspect import signature from typing import Any, 
Callable, Dict, Optional, Union, get_origin, get_args -from openvino import Node, Output -from openvino.utils.types import NodeInput, as_node, as_nodes +from openvino.runtime import Node, Output +from openvino.runtime.utils.types import NodeInput, as_node, as_nodes def _get_name(**kwargs: Any) -> Node: diff --git a/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py b/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py deleted file mode 100644 index bb0bac112d2c5f..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.decorators import _get_name -from openvino.utils.decorators import _set_node_friendly_name -from openvino.utils.decorators import nameable_op -from openvino.utils.decorators import unary_op -from openvino.utils.decorators import binary_op -from openvino.utils.decorators import custom_preprocess_function -from openvino.utils.decorators import MultiMethod -from openvino.utils.decorators import registry -from openvino.utils.decorators import overloading diff --git a/src/bindings/python/src/openvino/utils/input_validation.py b/src/bindings/python/src/openvino/runtime/utils/input_validation.py similarity index 98% rename from src/bindings/python/src/openvino/utils/input_validation.py rename to src/bindings/python/src/openvino/runtime/utils/input_validation.py index 1de08452e1da9f..e79a16c48581b1 100644 --- a/src/bindings/python/src/openvino/utils/input_validation.py +++ b/src/bindings/python/src/openvino/runtime/utils/input_validation.py @@ -9,7 +9,7 @@ import numpy as np -from openvino.exceptions import UserInputError +from openvino.runtime.exceptions import UserInputError log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py b/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py deleted file mode 100644 index 0b49e9ea33c40d..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.input_validation import assert_list_of_ints -from openvino.utils.input_validation import _check_value -from openvino.utils.input_validation import check_valid_attribute -from openvino.utils.input_validation import check_valid_attributes -from openvino.utils.input_validation import is_positive_value -from openvino.utils.input_validation import is_non_negative_value diff --git a/src/bindings/python/src/openvino/utils/node_factory.py b/src/bindings/python/src/openvino/runtime/utils/node_factory.py similarity index 92% rename from src/bindings/python/src/openvino/utils/node_factory.py rename to src/bindings/python/src/openvino/runtime/utils/node_factory.py index e999ae6988814a..25daf739223dba 100644 --- a/src/bindings/python/src/openvino/utils/node_factory.py +++ b/src/bindings/python/src/openvino/runtime/utils/node_factory.py @@ -2,16 +2,17 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import logging as log -from functools import singledispatchmethod +from functools import partial, singledispatchmethod from typing import Any, Dict, List, Optional, Union from pathlib import Path from openvino._pyopenvino import NodeFactory as _NodeFactory -from 
openvino import Node, Output, Extension +from openvino.runtime import Node, Output, Extension -from openvino.exceptions import UserInputError +from openvino.runtime.exceptions import UserInputError DEFAULT_OPSET = "opset13" @@ -124,11 +125,3 @@ def _arguments_as_outputs(arguments: List[Union[Node, Output]]) -> List[Output]: else: outputs.extend(argument.outputs()) return outputs - - -def _get_node_factory(opset_version: Optional[str] = None) -> NodeFactory: - """Return NodeFactory configured to create operators from specified opset version.""" - if opset_version: - return NodeFactory(opset_version) - else: - return NodeFactory() diff --git a/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py b/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py deleted file mode 100644 index 945ea8deb7863c..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.node_factory import NodeFactory diff --git a/src/bindings/python/src/openvino/utils/reduction.py b/src/bindings/python/src/openvino/runtime/utils/reduction.py similarity index 95% rename from src/bindings/python/src/openvino/utils/reduction.py rename to src/bindings/python/src/openvino/runtime/utils/reduction.py index e6be6d0ac9a104..71d0af8de7376e 100644 --- a/src/bindings/python/src/openvino/utils/reduction.py +++ b/src/bindings/python/src/openvino/runtime/utils/reduction.py @@ -4,7 +4,7 @@ from typing import Iterable, Optional -from openvino import Node +from openvino.runtime import Node def get_reduction_axes(node: Node, reduction_axes: Optional[Iterable[int]]) -> Iterable[int]: diff --git a/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py b/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py deleted file mode 100644 index a2fbff9e793dca..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.reduction import get_reduction_axes diff --git a/src/bindings/python/src/openvino/utils/types.py b/src/bindings/python/src/openvino/runtime/utils/types.py similarity index 97% rename from src/bindings/python/src/openvino/utils/types.py rename to src/bindings/python/src/openvino/runtime/utils/types.py index b3543739741d94..52f1faf8e1e839 100644 --- a/src/bindings/python/src/openvino/utils/types.py +++ b/src/bindings/python/src/openvino/runtime/utils/types.py @@ -9,9 +9,9 @@ import numpy as np -from openvino.exceptions import OVTypeError -from openvino import Node, Shape, Output, Type -from openvino.op import Constant +from openvino.runtime.exceptions import OVTypeError +from openvino.runtime import Node, Shape, Output, Type +from openvino.runtime.op import Constant log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/runtime/utils/types/__init__.py b/src/bindings/python/src/openvino/runtime/utils/types/__init__.py deleted file mode 100644 index 4f88d609988e8d..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/types/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.types import TensorShape -from openvino.utils.types import 
NumericData -from openvino.utils.types import NumericType -from openvino.utils.types import ScalarData -from openvino.utils.types import NodeInput - -from openvino.utils.types import openvino_to_numpy_types_map -from openvino.utils.types import openvino_to_numpy_types_str_map -from openvino.utils.types import get_element_type -from openvino.utils.types import get_element_type_str -from openvino.utils.types import get_dtype -from openvino.utils.types import get_numpy_ctype -from openvino.utils.types import get_ndarray -from openvino.utils.types import get_shape -from openvino.utils.types import make_constant_node -from openvino.utils.types import as_node -from openvino.utils.types import as_nodes diff --git a/src/bindings/python/src/openvino/package_utils.py b/src/bindings/python/src/openvino/utils.py similarity index 97% rename from src/bindings/python/src/openvino/package_utils.py rename to src/bindings/python/src/openvino/utils.py index 6aa3f3ed39b556..9890ae9b3e6460 100644 --- a/src/bindings/python/src/openvino/package_utils.py +++ b/src/bindings/python/src/openvino/utils.py @@ -21,9 +21,9 @@ def _add_openvino_libs_to_search_path() -> None: if os.path.isdir(os.path.join(os.path.dirname(__file__), "libs")): # looking for the libs in the pip installation path. openvino_libs.append(os.path.join(os.path.dirname(__file__), "libs")) - elif os.path.isdir(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, "Library", "bin")): + elif os.path.isdir(os.path.join(os.path.dirname(__file__), "..", "..", "..", "Library", "bin")): # looking for the libs in the conda installation path - openvino_libs.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, "Library", "bin")) + openvino_libs.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "Library", "bin")) else: # setupvars.bat script set all libs paths to OPENVINO_LIB_PATHS environment variable. openvino_libs_installer = os.getenv("OPENVINO_LIB_PATHS") diff --git a/src/bindings/python/src/openvino/utils/__init__.py b/src/bindings/python/src/openvino/utils/__init__.py deleted file mode 100644 index 2ccc79d20cce84..00000000000000 --- a/src/bindings/python/src/openvino/utils/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -"""Generic utilities. 
Factor related functions out to separate files.""" - -from openvino._pyopenvino.util import numpy_to_c, replace_node, replace_output_update_name - -from openvino.package_utils import get_cmake_path -from openvino.package_utils import deprecated -from openvino.package_utils import classproperty -from openvino.package_utils import deprecatedclassproperty diff --git a/src/bindings/python/src/openvino/utils/data_helpers/__init__.py b/src/bindings/python/src/openvino/utils/data_helpers/__init__.py deleted file mode 100644 index 282547dd9df79a..00000000000000 --- a/src/bindings/python/src/openvino/utils/data_helpers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.data_helpers.data_dispatcher import _data_dispatch -from openvino.utils.data_helpers.wrappers import tensor_from_file -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/common/transformations/include/transformations/common_optimizations/sdpa_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/sdpa_fusion.hpp new file mode 100644 index 00000000000000..84383b777604ea --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/sdpa_fusion.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +/// This pass transforms the following sub-graph to a single Scaled Dot Product Attention operation. +/// Before: +/// ┌───────┐ ┌───────┐ ┌───────┐ +/// │ Q │ │ K │ │ V │ +/// └───┬───┘ └───┬───┘ └───┬───┘ +/// │ │ │ +/// │ │ │ +/// ┌───┴───┐ ┌─────┴──────┐ │ +/// │ MatMul│<──│ Transpose │ │ +/// └───┬───┘ | (Optional) │ │ +/// │ └────────────┘ │ +/// ┌───┴───┐ ┌─────────────┐ │ +/// │ Add │<───│AttentionMask│ │ +/// └───┬───┘ | (Optional) │ │ +/// │ └─────────────┘ │ +/// ┌───┴───┐ │ +/// │Softmax│ │ +/// └───┬───┘ │ +/// │ │ +/// ┌───┴───┐ │ +/// │ MatMul│<─────────────────────┘ +/// └───┬───┘ +/// ┌───┴───┐ +/// │ Output│ +/// └───────┘ +/// +/// After: +/// ┌───────┐ ┌───────┐ ┌───────┐ ┌─────────────┐ +/// │ Q │ │ K │ │ V │ │AttentionMask│ +/// └───┬───┘ └───┬───┘ └───┬───┘ └──────┬──────┘ +/// │ │ │ │ +/// │ │ │ │ +/// ┌───┴────────────┴────────────┴───────────────┴─┐ +/// │ ScaledDotProductAttention │ +/// └────────────────────┬──────────────────────────┘ +/// │ +/// │ +/// ┌────┴────┐ +/// │ Output │ +/// └─────────┘ +class TRANSFORMATIONS_API SDPAFusion : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("SDPAFusion", "0"); + SDPAFusion(); +}; + +} // namespace pass +} // namespace ov diff --git a/src/common/transformations/include/transformations/common_optimizations/sdpa_scale_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/sdpa_scale_fusion.hpp new file mode 100644 index 00000000000000..cae0363e785f4e --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/sdpa_scale_fusion.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +/// Merges explicit multiplication by scalar value for Q and 
K into scale attribute of SDPA op +/// Before: +/// ┌───────┐ ┌───────┐ ┌───────┐ ┌─────────────┐ ┌─────────────┐ +/// │ Q │ │ K │ │ V │ │AttentionMask│ │ Scale | +/// └───┬───┘ └───┬───┘ └───┬───┘ │ (Optional) │ │ (Optional) │ +/// │ │ │ └──────┬──────┘ └───────┬─────┘ +/// │ │ │ │ | +/// ┌───┴───┐ ┌───┴───┐ │ │ | +/// │ Mul | │ Mul │ | │ | +/// └───┬───┘ └───┬───┘ │ │ │ +/// │ │ │ │ │ +/// | │ │ │ │ +/// ┌───┴────────────┴────────────┴─────────────┴─┐ | +/// │ ScaledDotProductAttention │──────────────────┘ +/// └────────────────────┬────────────────────────┘ +/// │ +/// │ +/// ┌────┴────┐ +/// │ Output │ +/// └─────────┘ +/// After: +/// ┌───────┐ ┌───────┐ ┌───────┐ ┌─────────────┐ ┌───────┐ +/// │ Q │ │ K │ │ V │ │AttentionMask│ │ Scale | +/// └───┬───┘ └───┬───┘ └───┬───┘ └──────┬──────┘ └───┬───┘ +/// │ │ │ │ | +/// │ │ │ │ | +/// | │ │ │ | +/// ┌───┴────────────┴────────────┴─────────────┴─┐ | +/// │ ScaledDotProductAttention │───────────┘ +/// └────────────────────┬────────────────────────┘ +/// │ +/// │ +/// ┌────┴────┐ +/// │ Output │ +/// └─────────┘ +/// Multiply ops for Q and K are eliminated in the following cases: +/// 1. Q_scale and K_scale are constant +/// 2. Q_scale * SDPA_Scale == 1 or K_scale * SDPA_Scale == 1 +class TRANSFORMATIONS_API SDPAScaleFusion : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("SDPAScaleFusion", "0"); + SDPAScaleFusion(); +}; + +} // namespace pass +} // namespace ov
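A quick numeric check of the folding rule described above (an illustrative sketch, not part of the patch; the values mirror the SDPAScaleFusionTest1 case added later in this change): when SDPA has no explicit scale input, its implicit scale is 1/sqrt(head_size), and the fused scale is the product of that value with the constant Q and K multipliers.

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float q_scale = 8.0f;  // constant Multiply on the Q branch
    const float k_scale = 8.0f;  // constant Multiply on the K branch
    const int head_size = 32;
    // No explicit SDPA scale input -> the implicit default 1 / sqrt(head_size).
    const float prev_scale = 1.0f / std::sqrt(static_cast<float>(head_size));
    // SDPAScaleFusion folds all three factors into a single scale attribute.
    const float fused_scale = prev_scale * q_scale * k_scale;
    assert(std::fabs(fused_scale - 64.0f / std::sqrt(32.0f)) < 1e-6f);
    return 0;
}
```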
diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp index 50c0ecd20e76af..825ce8acbd7998 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp @@ -15,6 +15,7 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API PositionIDsReplacer; +class TRANSFORMATIONS_API PositionIDsReplacerQwen; } // namespace pass } // namespace ov @@ -24,3 +25,22 @@ class ov::pass::PositionIDsReplacer : public ov::pass::MatcherPass { OPENVINO_MATCHER_PASS_RTTI("PositionIDsReplacer"); explicit PositionIDsReplacer(const Output<Node>& position_ids); }; + +/** + * @brief The Qwen model expects data to be processed in order; its "position ids" input is detached and + * is not explicitly used in the model. The model uses implicitly defined "position ids" based + * on the past KV cache size. + * + * To use this model in Continuous batching mode, we need to apply position_ids and + * use the corresponding rotary_emb_cos/rotary_emb_sin. + * For this, we replace + * rotary_emb_cos/rotary_emb_sin -> Slice -> Slice + * with + * rotary_emb_cos/rotary_emb_sin -> Gather(by position_ids) + * which enables applying RoPE to each token independently of its order in the input tensor. + */ +class ov::pass::PositionIDsReplacerQwen : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("PositionIDsReplacerQwen"); + explicit PositionIDsReplacerQwen(const Output<Node>& position_ids); +};
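To make the Slice-to-Gather rewrite concrete, here is a minimal sketch of the replacement subgraph (names and shapes are illustrative assumptions, not code from the patch): with continuous batching, tokens arrive out of order, so each token picks its own cos/sin row via Gather along the sequence axis instead of two sequential Slices.

```cpp
#include "openvino/core/node.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/gather.hpp"

using namespace ov;

// sincos_table: e.g. [1, max_seq_len, 1, head_size]; position_ids: [num_tokens]
std::shared_ptr<Node> gather_rope_rows(const Output<Node>& sincos_table,
                                       const Output<Node>& position_ids) {
    auto axis = op::v0::Constant::create(element::i64, Shape{}, {1});  // sequence axis
    return std::make_shared<op::v8::Gather>(sincos_table, position_ids, axis);
}
```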
diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp index f5497207eb4e17..d1cc5d5126cd67 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp @@ -4,7 +4,6 @@ #pragma once -#include "openvino/cc/pass/itt.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/subtract.hpp" #include "openvino/pass/matcher_pass.hpp" @@ -22,6 +21,8 @@ class TRANSFORMATIONS_API PrevSequenceLengthPattern; class ov::pass::PrevSequenceLengthPattern : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("PrevSequenceLengthPattern"); - explicit PrevSequenceLengthPattern(std::shared_ptr<ov::Node> prev_max_seq_len, std::shared_ptr<ov::Node> batch_dim); + OPENVINO_MATCHER_PASS_RTTI("PrevSequenceLengthPattern", "0"); + explicit PrevSequenceLengthPattern(const std::shared_ptr<ov::Node>& unsqueezed_input_ids, + const std::shared_ptr<ov::Node>& max_context_len, + const std::shared_ptr<ov::Node>& position_ids); }; diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp index b5ecb96fa95198..2456161ea80a78 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp @@ -15,6 +15,7 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API TotalSequenceLengthPattern; +class TRANSFORMATIONS_API TotalSequenceLengthPatternQwen; } // namespace pass } // namespace ov @@ -24,3 +25,22 @@ class ov::pass::TotalSequenceLengthPattern : public ov::pass::MatcherPass { OPENVINO_MATCHER_PASS_RTTI("TotalSequenceLengthPattern"); explicit TotalSequenceLengthPattern(const std::shared_ptr<ov::op::v0::Parameter>& max_context_len); }; + +/** + * @brief The Qwen model has a specific pattern for TotalSequenceLen place detection. + * + * common pattern: Add (PrevSeqLen, CurrentSeqLen) + * + * The CurrentSeqLen is presented in this form: + * CurrentSeqLen: Parameter(name: input_ids) -> ShapeOf -> Gather + * + * Before applying this transformation, we already detected the PrevSeqLen place in the PrevSequenceLengthPattern + * and replaced it with the following subgraph: + * PrevSeqLen: Subtract (in: Parameter(name: max_context_len), in: CurrentSeqLen) + * + **/ +class ov::pass::TotalSequenceLengthPatternQwen : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("TotalSequenceLengthPatternQwen", "0"); + explicit TotalSequenceLengthPatternQwen(const std::shared_ptr<ov::op::v0::Parameter>& max_context_len); +};
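Why the matched Add can be replaced by max_context_len directly: PrevSeqLen has already been rewritten as max_context_len - CurrentSeqLen, so the sum collapses. A scalar sketch with illustrative numbers (not code from the patch):

```cpp
#include <cassert>

int main() {
    const int max_context_len = 128;  // Parameter introduced for PagedAttention
    const int current_seq_len = 3;    // derived from the input_ids shape
    // PrevSequenceLengthPattern already rewrote PrevSeqLen as:
    const int prev_seq_len = max_context_len - current_seq_len;
    // ...so the matched Add(PrevSeqLen, CurrentSeqLen) equals max_context_len:
    assert(prev_seq_len + current_seq_len == max_context_len);
    return 0;
}
```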
"transformations/common_optimizations/shared_ops_optimization.hpp" @@ -229,6 +230,7 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr ADD_MATCHER(common_fusions, ConvertTensorIteratorToSequence) ADD_MATCHER(common_fusions, SplitConcatPairToInterpolateFusion, m_use_shapes) ADD_MATCHER(common_fusions, ConvolutionToGroupConvolutionFusion) + ADD_MATCHER(common_fusions, SDPAFusion) if (m_use_shapes) { ADD_MATCHER(common_fusions, NearestNeighborUpsamplingFusion) } diff --git a/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp new file mode 100644 index 00000000000000..fc581580f70001 --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/sdpa_fusion.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/gen_pattern.hpp" + +namespace ov { +namespace pass { + +SDPAFusion::SDPAFusion() { + using namespace ov::pass::pattern; + using namespace ov::gen_pattern; + + auto q = makePattern(ov::Rank(4)); + auto k = makePattern(ov::Rank(4)); + auto v = makePattern(ov::Rank(4)); + auto mask = makePattern(); + + auto k_transpose_order = pattern::wrap_type([](const Output& node) { + auto axis_order = + std::dynamic_pointer_cast(node.get_node_shared_ptr())->cast_vector(); + return axis_order == std::vector{0, 1, 3, 2}; + }); + + auto k_t = pattern::wrap_type({k, k_transpose_order}); + auto qk_nn = makePattern({q, k_t}, {{"transpose_a", false}, {"transpose_b", false}}); + auto qk_nt = makePattern({q, k}, {{"transpose_a", false}, {"transpose_b", true}}); + auto qk = qk_nt | qk_nn; + auto optional_add_mask = optional({qk, mask}); + auto softmax = makePattern({optional_add_mask}, {{"axis", "-1"}}); + auto qkv = makePattern({softmax, v}, {{"transpose_a", false}, {"transpose_b", false}}); + + auto valid_qk_shapes = [](const std::shared_ptr& qk_matmul) { + auto q_pshape = qk_matmul->get_input_partial_shape(0); + auto k_pshape = qk_matmul->get_input_partial_shape(1); + + const size_t q_head_size_idx = 3; + const size_t k_head_size_idx = qk_matmul->get_transpose_b() ? 
diff --git a/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp new file mode 100644 index 00000000000000..fc581580f70001 --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/sdpa_fusion.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/gen_pattern.hpp" + +namespace ov { +namespace pass { + +SDPAFusion::SDPAFusion() { + using namespace ov::pass::pattern; + using namespace ov::gen_pattern; + + auto q = makePattern(ov::Rank(4)); + auto k = makePattern(ov::Rank(4)); + auto v = makePattern(ov::Rank(4)); + auto mask = makePattern(); + + auto k_transpose_order = pattern::wrap_type<ov::op::v0::Constant>([](const Output<Node>& node) { + auto axis_order = + std::dynamic_pointer_cast<ov::op::v0::Constant>(node.get_node_shared_ptr())->cast_vector<int64_t>(); + return axis_order == std::vector<int64_t>{0, 1, 3, 2}; + }); + + auto k_t = pattern::wrap_type<ov::op::v1::Transpose>({k, k_transpose_order}); + auto qk_nn = makePattern<ov::op::v0::MatMul>({q, k_t}, {{"transpose_a", false}, {"transpose_b", false}}); + auto qk_nt = makePattern<ov::op::v0::MatMul>({q, k}, {{"transpose_a", false}, {"transpose_b", true}}); + auto qk = qk_nt | qk_nn; + auto optional_add_mask = optional<ov::op::v1::Add>({qk, mask}); + auto softmax = makePattern<ov::op::v8::Softmax>({optional_add_mask}, {{"axis", "-1"}}); + auto qkv = makePattern<ov::op::v0::MatMul>({softmax, v}, {{"transpose_a", false}, {"transpose_b", false}}); + + auto valid_qk_shapes = [](const std::shared_ptr<ov::op::v0::MatMul>& qk_matmul) { + auto q_pshape = qk_matmul->get_input_partial_shape(0); + auto k_pshape = qk_matmul->get_input_partial_shape(1); + + const size_t q_head_size_idx = 3; + const size_t k_head_size_idx = qk_matmul->get_transpose_b() ? 3 : 2; + + return q_pshape.size() == 4 && k_pshape.size() == 4 && q_pshape[q_head_size_idx].is_static() && + k_pshape[k_head_size_idx].is_static() && + q_pshape[q_head_size_idx].get_length() == k_pshape[k_head_size_idx].get_length(); + }; + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto q_node = pattern_map.at(q); + auto k_node = pattern_map.at(k); + auto v_node = pattern_map.at(v); + + if (!valid_qk_shapes(ov::as_type_ptr<ov::op::v0::MatMul>(pattern_map.at(qk).get_node_shared_ptr()))) { + return false; + } + + if (pattern_map.at(qk).get_target_inputs().size() > 1 || + pattern_map.at(softmax).get_target_inputs().size() > 1) { + return false; + } + if (pattern_map.count(optional_add_mask) && (pattern_map.at(optional_add_mask).get_target_inputs().size() > 1 || + pattern_map.at(mask).get_partial_shape().size() > 4)) { + return false; + } + + Output<ov::Node> mask_value; + Output<ov::Node> mask_input; + if (pattern_map.find(optional_add_mask) != pattern_map.end()) { + mask_value = pattern_map.at(mask); + } else { + mask_value = ov::op::v0::Constant::create(q_node.get_element_type(), ov::Shape{}, std::vector<float>{0}); + } + + if (mask_value.get_partial_shape().size() > 4) { + return false; + } + + if (mask_value.get_partial_shape().rank() == 0 || mask_value.get_partial_shape().rank() == 4) { + mask_input = mask_value; + } else { + size_t rank_diff = q_node.get_partial_shape().size() - mask_value.get_partial_shape().size(); + std::vector<int64_t> axes(rank_diff); + std::iota(axes.begin(), axes.end(), 0); + mask_input = std::make_shared<ov::op::v0::Unsqueeze>( + mask_value, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank_diff}, axes)); + } + + std::shared_ptr<ov::Node> scale_node = + ov::op::v0::Constant::create(q_node.get_element_type(), ov::Shape{}, std::vector<float>{1.0f}); + + std::shared_ptr<ov::Node> sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q_node, + k_node, + v_node, + mask_input, + scale_node, + false); + + sdpa->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa); + ov::replace_node(m.get_match_root(), sdpa); + + return true; + }; + + auto m = std::make_shared<ov::pass::pattern::Matcher>(qkv, "SDPAFusion"); + this->register_matcher(m, callback); +} + +} // namespace pass +} // namespace ov
diff --git a/src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp new file mode 100644 index 00000000000000..3d750fe38a868e --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/sdpa_scale_fusion.hpp" + +#include <cmath> + +#include "openvino/core/node.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "transformations/utils/gen_pattern.hpp" + +namespace ov { +namespace pass { + +SDPAScaleFusion::SDPAScaleFusion() { + using namespace ov::pass::pattern; + using namespace ov::gen_pattern; + + auto q = makePattern(ov::Rank(4)); + auto k = makePattern(ov::Rank(4)); + auto v = makePattern(ov::Rank(4)); + auto mask = makePattern(); + auto sdpa_scale = makeConst({}); + auto scale_q = makePattern("[]") | makePattern("[1]"); + auto scale_k = makePattern("[]") | makePattern("[1]"); + + auto scaled_q = optional<ov::op::v1::Multiply>({q, scale_q}); + auto scaled_k = optional<ov::op::v1::Multiply>({k, scale_k}); + auto sdpa_mask_scale = + makePattern<ov::op::v13::ScaledDotProductAttention>({scaled_q, scaled_k, v, mask, sdpa_scale}, + {{"causal", false}}); + auto sdpa_mask = + makePattern<ov::op::v13::ScaledDotProductAttention>({scaled_q, scaled_k, v, mask}, {{"causal", false}}); + auto sdpa_simple = + makePattern<ov::op::v13::ScaledDotProductAttention>({scaled_q, scaled_k, v}, {{"causal", false}}); + auto sdpa = sdpa_simple | sdpa_mask | sdpa_mask_scale; + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto sdpa = m.get_match_root(); + + const bool has_q_scale = pattern_map.count(scaled_q); + const bool has_k_scale = pattern_map.count(scaled_k); + + // Nothing to do + if (!has_q_scale && !has_k_scale) + return false; + + auto prev_scale_value = 1.0f; + auto scale_q_value = 1.0f; + auto scale_k_value = 1.0f; + auto scale_et = sdpa->get_output_element_type(0); + + Output<ov::Node> q_input = sdpa->get_input_source_output(0); + Output<ov::Node> k_input = sdpa->get_input_source_output(1); + + std::shared_ptr<ov::Node> scale_q_node = nullptr; + std::shared_ptr<ov::Node> scale_k_node = nullptr; + + if (pattern_map.find(sdpa_scale) != pattern_map.end()) { + auto prev_scale_node = + ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(sdpa_scale).get_node_shared_ptr()); + prev_scale_value = prev_scale_node->cast_vector<float>()[0]; + scale_et = prev_scale_node->get_output_element_type(0); + } else { + auto head_size = q_input.get_partial_shape()[3]; + if (head_size.is_dynamic()) + return false; + + prev_scale_value = 1.0f / std::sqrt(static_cast<float>(head_size.get_length())); + } + + // Extract scalar scale values for Q and K if those are constant and set new inputs for SDPA + if (has_q_scale) { + scale_q_node = pattern_map.at(scale_q).get_node_shared_ptr(); + if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) { + scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0]; + q_input = pattern_map.at(q); + } + } + if (has_k_scale) { + scale_k_node = pattern_map.at(scale_k).get_node_shared_ptr(); + if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) { + scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0]; + k_input = pattern_map.at(k); + } + } + + Output<ov::Node> new_scale_node; + auto new_scale_val = prev_scale_value * scale_q_value * scale_k_value; + + // If the new scale is 1 and we have a non-constant scale node for either Q or K, then we can make it a scale of SDPA + if (new_scale_val == 1.0f) { + if (has_q_scale && !ov::is_type<ov::op::v0::Constant>(scale_q_node)) { + new_scale_node = pattern_map.at(scale_q); + q_input = pattern_map.at(q); + } else if (has_k_scale && !ov::is_type<ov::op::v0::Constant>(scale_k_node)) { + new_scale_node = pattern_map.at(scale_k); + k_input = pattern_map.at(k); + } else { + new_scale_node = ov::op::v0::Constant::create(scale_et, ov::Shape{}, std::vector<float>{new_scale_val}); + } + } else { + new_scale_node = ov::op::v0::Constant::create(scale_et, ov::Shape{}, std::vector<float>{new_scale_val}); + } + + OutputVector new_inputs = {q_input, k_input, pattern_map.at(v)}; + if (pattern_map.find(mask) != pattern_map.end()) { + new_inputs.push_back(pattern_map.at(mask)); + } else { + new_inputs.push_back( + ov::op::v0::Constant::create(new_scale_node.get_element_type(), ov::Shape{}, std::vector<float>{0.0f})); + } + + new_inputs.push_back(new_scale_node); + + auto new_sdpa = sdpa->clone_with_new_inputs(new_inputs); + new_sdpa->set_friendly_name(sdpa->get_friendly_name()); + ov::copy_runtime_info(sdpa, new_sdpa); + ov::replace_node(sdpa, new_sdpa); + + return true; + }; + + auto m = std::make_shared<ov::pass::pattern::Matcher>(sdpa, "SDPAScaleFusion"); + this->register_matcher(m, callback); +} + +} // namespace pass +} // namespace ov
diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp index a72a49fb4832eb..397746c75bb84d 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp @@ -7,11 +7,18 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/op/gather.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/unsqueeze.hpp" #include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" using namespace ov::op; +using namespace ov::pass::pattern; // TODO: Instead of using the following transformation that matches quite a specific place in a model graph in case when // position_ids parameter is missing, consider replacing always existing attention_mask parameter with a sub-graph using @@ -19,25 +26,90 @@ using namespace ov::op; ov::pass::PositionIDsReplacer::PositionIDsReplacer(const Output<Node>& position_ids) { MATCHER_SCOPE(PositionIDsReplacer); - auto input_ids = pattern::any_input(); - auto input_embed = pattern::wrap_type<v8::Gather>({pattern::any_input(), input_ids, pattern::any_input()}); + auto input_ids = any_input(); + auto input_embed = wrap_type<v8::Gather>({any_input(), input_ids, any_input()}); - auto position_ids_pattern = pattern::any_input(); - auto offset = pattern::wrap_type<v0::Constant>(); - auto add_offset = pattern::wrap_type<v1::Add>({position_ids_pattern, offset}); - auto convert = pattern::wrap_type<v0::Convert>({add_offset}); - auto position_embed = pattern::wrap_type<v8::Gather>({pattern::any_input(), convert, pattern::any_input()}); + auto position_ids_pattern = any_input(); + auto offset = wrap_type<v0::Constant>(); + auto add_offset = wrap_type<v1::Add>({position_ids_pattern, offset}); + auto convert = wrap_type<v0::Convert>({add_offset}); + auto position_embed = wrap_type<v8::Gather>({any_input(), convert, any_input()}); - auto mul = pattern::optional<v1::Multiply>({input_embed, pattern::any_input()}); + auto mul = optional<v1::Multiply>({input_embed, any_input()}); - auto add = pattern::wrap_type<v1::Add>({mul, position_embed}); + auto add = wrap_type<v1::Add>({mul, position_embed}); - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + ov::matcher_pass_callback callback = [=](Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); replace_node(pattern_map.at(position_ids_pattern).get_node_shared_ptr(), position_ids.get_node_shared_ptr()); return true; }; - auto m = std::make_shared<ov::pass::pattern::Matcher>(add, matcher_name); + auto m = std::make_shared<Matcher>(add, matcher_name); register_matcher(m, callback); -} \ No newline at end of file +} + +ov::pass::PositionIDsReplacerQwen::PositionIDsReplacerQwen(const Output<Node>& position_ids) { + MATCHER_SCOPE(PositionIDsReplacerQwen); + + auto _const = []() { + return wrap_type<v0::Constant>(); + }; + + // total seq len: + auto p_max_context_len = wrap_type<v0::Parameter>(); + auto p_opt_convert = optional<v0::Convert>(p_max_context_len); + auto p_opt_reshape = optional<v1::Reshape>({p_opt_convert, any_input()}); + + // current seq len: + // it might be present in 2 different ways: + // input_ids -> unsqueeze -> reshape -> convert -> shape_of -> gather + // QKV -> variadic_split(Q or K) -> rope Q/K -> shape_of -> gather + // Probably we can use the symbols to re-use one of these ways. + // Currently, "any_input" is used to detect both places. + auto p_shape_of = wrap_type<v3::ShapeOf>({any_input()}); + auto p_current_len = wrap_type<v8::Gather>({p_shape_of, _const(), _const()}); + + auto p_neg_const = wrap_type<v0::Constant>(); + auto p_neg_mul = wrap_type<v1::Multiply>({p_current_len, p_neg_const}); + // the rotary_emb_cos/rotary_emb_sin are sliced by the total length [1,..4096,1,128] + auto p_rotary_emb_sincos = wrap_type<v0::Constant>(); + auto p_slice_1 = wrap_type<v8::Slice>({p_rotary_emb_sincos, _const(), p_opt_reshape, _const(), _const()}); + auto p_slice_2 = wrap_type<v8::Slice>({p_slice_1, p_neg_mul, _const(), _const(), _const()}); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto max_context_len = pattern_map.at(p_max_context_len).get_node_shared_ptr(); + if (max_context_len->get_friendly_name() != "max_context_len") { + return false; + } + auto rotary_emb_sincos = pattern_map.at(p_rotary_emb_sincos).get_node_shared_ptr(); + auto slice_1 = pattern_map.at(p_slice_1).get_node_shared_ptr(); + auto slice_2 = pattern_map.at(p_slice_2).get_node_shared_ptr(); + + auto axis = v0::Constant::create(element::i64, Shape{}, {1}); + // in case of PagedAttention (Continuous batching) the rotary_emb_cos/rotary_emb_sin + // are used not in the sequential order, so we need to use position_ids to get the expected values. + auto gather = std::make_shared<v8::Gather>(slice_1->input_value(0), position_ids, axis); + gather->set_friendly_name(slice_2->get_friendly_name()); + gather->validate_and_infer_types(); + + auto pshape = rotary_emb_sincos->get_output_partial_shape(0); + if (pshape.rank().is_dynamic() || pshape.rank().get_length() != 4) { + return false; + } + + // PagedAttention expects the following layout for Q,K,V: + // [batch_size_in_tokens, num_kv_heads * head_size] + // so here we need to reshape the output tensor to move the seq dim (num tokens) to the batch + // num_kv_heads * head_size are already handled in the StateManagementPattern transformation + auto head_size = static_cast<int64_t>(pshape[3].get_length()); + auto new_shape = v0::Constant::create(element::i64, Shape{4}, std::vector<int64_t>{-1, 1, 1, head_size}); + auto reshape = std::make_shared<v1::Reshape>(gather, new_shape, false); + replace_node(slice_2, reshape); + return true; + }; + + auto m = std::make_shared<Matcher>(p_slice_2, matcher_name); + register_matcher(m, callback); +}
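A small shape-bookkeeping sketch for the Gather-plus-Reshape emitted above (illustrative sizes, not from the patch): gathering 5 tokens from a [1, 4096, 1, 128] sin/cos table along axis 1 yields [1, 5, 1, 128], and the {-1, 1, 1, 128} Reshape then moves the token count into the batch dimension, matching PagedAttention's batch-of-tokens layout.

```cpp
#include <cassert>

#include "openvino/op/constant.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"

using namespace ov;

int main() {
    auto table = std::make_shared<op::v0::Parameter>(element::f32, Shape{1, 4096, 1, 128});
    auto position_ids = std::make_shared<op::v0::Parameter>(element::i64, Shape{5});
    auto axis = op::v0::Constant::create(element::i64, Shape{}, {1});
    auto gather = std::make_shared<op::v8::Gather>(table, position_ids, axis);  // [1, 5, 1, 128]
    auto new_shape = op::v0::Constant::create(element::i64, Shape{4}, std::vector<int64_t>{-1, 1, 1, 128});
    auto reshape = std::make_shared<op::v1::Reshape>(gather, new_shape, false);
    // The token (sequence) dimension is now the batch dimension.
    assert(reshape->get_output_shape(0) == (Shape{5, 1, 1, 128}));
    return 0;
}
```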
diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp index 36d9d88975b2e0..55d7af822c3857 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp @@ -14,8 +14,9 @@ using namespace ov::op; -ov::pass::PrevSequenceLengthPattern::PrevSequenceLengthPattern(std::shared_ptr<ov::Node> prev_max_seq_len, - std::shared_ptr<ov::Node> batch_dim) { +ov::pass::PrevSequenceLengthPattern::PrevSequenceLengthPattern(const std::shared_ptr<ov::Node>& unsqueezed_input_ids, + const std::shared_ptr<ov::Node>& max_context_len, + const std::shared_ptr<ov::Node>& position_ids) { MATCHER_SCOPE(PrevSequenceLengthPattern); // The transformation addresses two cases that look similar: (1) previous sequence length, (2) batch size in // kv-cache state. In the first case it should replace it by prev_max_seq_len. For the second case, connect to batch_dim. @@ -40,8 +41,16 @@ ov::pass::PrevSequenceLengthPattern::PrevSequenceLengthPattern(std::shared_ptr<ov::Node> auto target_type = gather->get_output_element_type(0); std::shared_ptr<Node> replacement; if (kv_init_shape[axis].is_static() && kv_init_shape[axis].get_length() == 0) { + auto cur_seq_len = std::make_shared<v8::Gather>(std::make_shared<v3::ShapeOf>(unsqueezed_input_ids), + v0::Constant::create(element::i64, Shape{}, {1}), + v0::Constant::create(element::i64, Shape{}, {0})); + auto cur_seq_len_i32 = std::make_shared<v0::Convert>(cur_seq_len, element::i32); + auto prev_max_seq_len = std::make_shared<v1::Subtract>(max_context_len, cur_seq_len_i32); replacement = prev_max_seq_len; } else { + // it is not always required, so will be disposed if not needed + auto batch_dim = std::make_shared<v3::ShapeOf>(position_ids); + // assumption that any other axis should point to batch dimension, precise reasoning is too complex // TODO: provide more reliable check replacement = batch_dim;
diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp index b55c3d73316120..a36085c34237a4 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp @@ -437,6 +437,7 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par parameters_to_remove.push_back(param); } + pa_transpose->set_friendly_name(sdpa_node->get_friendly_name()); replace_node(m.get_match_root(), pa_transpose); return true; }; diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp index 18387d5ca1ae04..cbf9426a0c82c5 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp @@ -6,27 +6,49 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/core/validation_util.hpp" +#include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" #include "openvino/op/shape_of.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" using namespace ov::op; +using namespace ov::pass::pattern; + +namespace { + +void align_replacement(std::shared_ptr<ov::Node>& replacement, + const ov::PartialShape& required_shape, + ov::element::Type target_type) { + if (replacement->get_output_element_type(0) != target_type) { + replacement = std::make_shared<v0::Convert>(replacement, target_type); + } + + if (replacement->get_output_partial_shape(0) != required_shape && required_shape.rank().is_static()) { + replacement = ov::op::util::reshapeTo(replacement, ov::Shape(required_shape.rank().get_length(), 1)); + } +} + +} // namespace ov::pass::TotalSequenceLengthPattern::TotalSequenceLengthPattern( const std::shared_ptr<ov::op::v0::Parameter>& max_context_len) { MATCHER_SCOPE(TotalSequenceLengthPattern); - auto kv_past = pattern::wrap_type<v6::ReadValue>({pattern::any_input()}); - auto kv_gather = pattern::wrap_type<v8::Gather>({kv_past, pattern::any_input(), pattern::any_input()}); - auto kv_current = pattern::any_input(); - auto kv_concat = pattern::wrap_type<v0::Concat>({kv_gather, kv_current}); - auto kv_shape = pattern::wrap_type<v3::ShapeOf>({kv_concat}); - auto gather_idx_label = pattern::wrap_type<v0::Constant>(); - auto seq = pattern::wrap_type<v8::Gather>({kv_shape, gather_idx_label, pattern::any_input()}); + auto kv_past = wrap_type<v6::ReadValue>({any_input()}); + auto kv_gather = wrap_type<v8::Gather>({kv_past, any_input(), any_input()}); + auto kv_current = any_input(); + auto kv_concat = wrap_type<v0::Concat>({kv_gather, kv_current}); + auto kv_shape = wrap_type<v3::ShapeOf>({kv_concat}); + auto gather_idx_label = wrap_type<v0::Constant>(); + auto seq = wrap_type<v8::Gather>({kv_shape, gather_idx_label, any_input()}); - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + ov::matcher_pass_callback callback = [=](Matcher& m) { // TODO: Check that seq has axis that really takes sequence len but not any other dimension -- // use symbolic infra or look at the constant input const auto& pattern_map = m.get_pattern_value_map(); @@ -71,16 +93,8 @@ ov::pass::TotalSequenceLengthPattern::TotalSequenceLengthPattern( if (concat_axis_to_compare == gather_idx_to_compare) { auto target_type = gather->get_output_element_type(0); - - if (replacement->get_output_element_type(0) != target_type) { - replacement = std::make_shared<v0::Convert>(replacement, target_type); - } - auto required_shape = gather->get_output_partial_shape(0); - - if (replacement->get_output_partial_shape(0) != required_shape && required_shape.rank().is_static()) { - replacement = op::util::reshapeTo(replacement, Shape(required_shape.rank().get_length(), 1)); - } + align_replacement(replacement, required_shape, target_type); } else { // TODO: change in the future when we start supporting dynamic shapes here replacement = ov::util::get_constant_from_source(gather->output(0)); @@ -94,6 +108,41 @@ ov::pass::TotalSequenceLengthPattern::TotalSequenceLengthPattern( return true; }; - auto m = std::make_shared<ov::pass::pattern::Matcher>(seq, matcher_name); + auto m = std::make_shared<Matcher>(seq, matcher_name); + register_matcher(m, callback); +} + +ov::pass::TotalSequenceLengthPatternQwen::TotalSequenceLengthPatternQwen( + const std::shared_ptr<ov::op::v0::Parameter>& max_context_len) { + MATCHER_SCOPE(TotalSequenceLengthPatternQwen); + + auto p_input_ids = wrap_type<v0::Parameter>(); + auto p_unsqueeze = wrap_type<v0::Unsqueeze>({p_input_ids, any_input()}); + auto p_opt_reshape_1 = optional<v1::Reshape>({p_unsqueeze, any_input()}); + auto p_opt_convert_1 = optional<v0::Convert>(p_opt_reshape_1); + auto p_kv_shape_current = wrap_type<v3::ShapeOf>({p_opt_convert_1}); + auto p_seq_current = wrap_type<v8::Gather>({p_kv_shape_current, any_input(), any_input()}); + auto p_opt_convert_2 = optional<v0::Convert>(p_seq_current); + + auto p_max_context_len = wrap_type<v0::Parameter>(); + auto p_prev_max_seq_len = wrap_type<v1::Subtract>({p_max_context_len, any_input()}); + auto p_opt_convert_3 = optional<v0::Convert>(p_prev_max_seq_len); + auto p_opt_reshape_2 = optional<v1::Reshape>({p_opt_convert_3, any_input()}); + auto p_total_seq = wrap_type<v1::Add>({p_opt_convert_2, p_opt_reshape_2}); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto total_seq = pattern_map.at(p_total_seq).get_node_shared_ptr(); + std::shared_ptr<Node> replacement = max_context_len; + + auto target_type = total_seq->get_output_element_type(0); + auto required_shape = total_seq->get_output_partial_shape(0); + align_replacement(replacement, required_shape, target_type); + + replace_node(total_seq, replacement); + return true; + }; + + auto m = std::make_shared<Matcher>(p_total_seq, matcher_name); register_matcher(m, callback); -} \ No newline at end of file +}
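A concrete instance of what the new align_replacement helper does (a sketch with assumed values, not code from the patch): if max_context_len is an i32 scalar while the Gather being replaced produced an i64 tensor of shape [1], the helper appends a Convert and a rank-restoring Reshape so consumers see the same type and shape as before.

```cpp
#include <cassert>

#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"

using namespace ov;

int main() {
    // i32 scalar stand-in for the max_context_len Parameter
    auto max_len = std::make_shared<op::v0::Parameter>(element::i32, Shape{});
    // Target: i64 of shape [1], i.e. what the replaced Gather produced.
    std::shared_ptr<Node> repl = std::make_shared<op::v0::Convert>(max_len, element::i64);
    auto shape = op::v0::Constant::create(element::i64, Shape{1}, {1});
    repl = std::make_shared<op::v1::Reshape>(repl, shape, false);
    assert(repl->get_output_element_type(0) == element::i64);
    assert(repl->get_output_shape(0) == Shape{1});
    return 0;
}
```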
diff --git a/src/common/transformations/tests/common_optimizations/sdpa_fusion_test.cpp b/src/common/transformations/tests/common_optimizations/sdpa_fusion_test.cpp new file mode 100644 index 00000000000000..52c10ba5967bd8 --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/sdpa_fusion_test.cpp @@ -0,0 +1,234 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> + +#include <memory> +#include <openvino/core/model.hpp> +#include <openvino/op/add.hpp> +#include <openvino/op/parameter.hpp> +#include <openvino/pass/manager.hpp> +#include <transformations/common_optimizations/sdpa_fusion.hpp> + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/transpose.hpp" + +using namespace testing; +using namespace ov::pass; +using namespace ov; + +TEST_F(TransformationTestsF, SDPAFusionTest1) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f32, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f32, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f32, value_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector<float>{1.0f}); + const auto mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector<float>{0.0f}); + const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, + key, + value, + mask_const, + scale_const, + causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest2) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f}); + const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, + key, + value, + mask_const, + scale_const, + causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +}
+TEST_F(TransformationTestsF, SDPAFusionTest3) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto causal = false; + { + const auto key_t = + std::make_shared<ov::op::v1::Transpose>(key, + op::v0::Constant::create(element::i64, Shape{4}, {0, 1, 3, 2})); + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key_t, false, false); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f}); + const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, + key, + value, + mask_const, + scale_const, + causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest4) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, 32, -1}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, false); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + model_ref = model->clone(); + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +}
+TEST_F(TransformationTestsF, SDPAFusionTest5) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + const PartialShape attention_mask_shape{1, 32, -1, -1}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto mask = std::make_shared<ov::op::v0::Parameter>(element::f16, attention_mask_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto mask_add = std::make_shared<ov::op::v1::Add>(qk, mask); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(mask_add, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value, mask}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto sdpa = + std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, key, value, mask, scale_const, causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value, mask}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest6) { + const PartialShape query_shape{1, 32, 10, 32}; + const PartialShape key_shape{1, 32, 10, 32}; + const PartialShape value_shape{1, 32, 10, 32}; + const PartialShape attention_mask_shape{1, 1, 10, 10}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto mask = std::make_shared<ov::op::v0::Parameter>(element::f16, attention_mask_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto mask_add = std::make_shared<ov::op::v1::Add>(qk, mask); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(mask_add, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value, mask}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto sdpa = + std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, key, value, mask, scale_const, causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value, mask}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +}
query_shape); + const auto key = std::make_shared(element::f16, key_shape); + const auto value = std::make_shared(element::f16, value_shape); + const auto mask = std::make_shared(element::f16, attention_mask_shape); + const auto casual = false; + { + const auto qk = std::make_shared(query, key, false, true); + const auto mask_add = std::make_shared(qk, mask); + const auto softmax = std::make_shared(mask_add, -1); + const auto qkv = std::make_shared(softmax, value, false, false); + + model = std::make_shared(NodeVector{qkv}, ParameterVector{query, key, value, mask}); + manager.register_pass(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector{1.0f}); + const auto sdpa = + std::make_shared(query, key, value, mask, scale_const, casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, mask}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest7) { + const PartialShape query_shape{1, 8, -1, 32}; + const PartialShape key_shape{-1, 1, 8, 32}; + const PartialShape value_shape{1, 8, -1, 32}; + + const auto query = std::make_shared(element::f16, query_shape); + const auto key = std::make_shared(element::f16, key_shape); + const auto value = std::make_shared(element::f16, value_shape); + { + const auto key_t = + std::make_shared(key, + op::v0::Constant::create(element::i64, Shape{4}, {1, 2, 3, 0})); + const auto qk = std::make_shared(query, key_t, false, false); + const auto softmax = std::make_shared(qk, -1); + const auto qkv = std::make_shared(softmax, value, false, false); + + model = std::make_shared(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass(); + } +} diff --git a/src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp b/src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp new file mode 100644 index 00000000000000..f922f030a9c43b --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" + +using namespace testing; +using namespace ov::pass; +using namespace ov; + +TEST_F(TransformationTestsF, SDPAScaleFusionTest1) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_const); + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = + std::make_shared(q_scaled, k_scaled, v_scaled, casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + manager.register_pass(); + } + + { + const auto new_mask_const = 
ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto new_scale_const = + ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{64.0f / std::sqrt(32.0f)}); + const auto sdpa = std::make_shared(query, + key, + v_scaled, + new_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest2) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{2.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_const); + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = std::make_shared(q_scaled, + k_scaled, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + manager.register_pass(); + } + + { + const auto new_scale_const = + ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{128.0f}); + const auto sdpa = std::make_shared(query, + key, + v_scaled, + sdpa_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest3) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{2.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_const); + const auto sdpa = std::make_shared(q_scaled, + key, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + manager.register_pass(); + } + + { + const auto new_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{16.0f}); + const auto sdpa = std::make_shared(query, + key, + v_scaled, + sdpa_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, 
ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest4) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{2.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto scale_dyn = std::make_shared(element::f32, ov::Shape{}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + const auto q_scaled = std::make_shared(query, scale_dyn); + { + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = std::make_shared(q_scaled, + k_scaled, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + manager.register_pass(); + } + + { + const auto new_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{16.0f}); + const auto sdpa = std::make_shared(q_scaled, + key, + v_scaled, + sdpa_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest5) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{1.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{1.0f}); + const auto scale_dyn = std::make_shared(element::f32, ov::Shape{}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_dyn); + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = std::make_shared(q_scaled, + k_scaled, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + manager.register_pass(); + } + + { + const auto sdpa = std::make_shared(query, + key, + v_scaled, + sdpa_mask_const, + scale_dyn, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} diff --git 
a/src/common/transformations/tests/op_conversions/sdpa_to_paged_attention_test.cpp b/src/common/transformations/tests/op_conversions/sdpa_to_paged_attention_test.cpp new file mode 100644 index 00000000000000..840309993c939a --- /dev/null +++ b/src/common/transformations/tests/op_conversions/sdpa_to_paged_attention_test.cpp @@ -0,0 +1,618 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/pass/sdpa_to_paged_attention.hpp" + +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/core/model.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/ops.hpp" +#include "openvino/op/paged_attention.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/reduce_mean.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/sqrt.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp" +#include "transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp" +#include "transformations/utils/gen_pattern.hpp" +#include "transformations/utils/print_model.hpp" + +using namespace ov; +using namespace std; +using namespace testing; +using namespace ov::op; +using namespace ov::gen_pattern; + +namespace { + +// Constants and Parameters attributes: +auto el_type_i64 = std::pair({"element_type", "i64"}); +auto el_type_i32 = std::pair({"element_type", "i32"}); +auto el_type_f32 = std::pair({"element_type", "f32"}); + +// Convert ops attributes: +auto dest_type_i64 = std::pair({"destination_type", "i64"}); +auto dest_type_f32 = std::pair({"destination_type", "f32"}); +auto dest_type_f16 = std::pair({"destination_type", "f16"}); + +// Other attributes: +auto numpy_broadcast = std::pair({"auto_broadcast", "numpy"}); +auto special_zero_true = std::pair({"special_zero", true}); + +auto single_val = [](int rank, float val) { + return makeConst(element::f32, ov::Shape{std::vector(rank, 1)}, {val}); +}; + +ov::ParameterVector nodes_to_params(const ov::NodeVector& node_vec) { + ov::ParameterVector params; + params.reserve(node_vec.size()); + for (const auto& node : node_vec) { + params.push_back(ov::as_type_ptr(node)); + } + return params; +} + +enum QKV : int { Q = 0, K = 1, V = 2 }; +vector MOCK_VALUE = {1}; + +// original weights = 151936, attention_weights = 12288 +#define WEIGHTS 1024 +#define ATTENTION_WEIGHTS 512 + +class Qwen7bChatSDPA { +public: + static std::shared_ptr gen_embeddings(const std::shared_ptr& input_ids) { + auto view_reshape = makeOP({input_ids, {-1, 0}}, {special_zero_true}); + auto input_ids_i64 = makeOP({view_reshape}, {dest_type_i64}); + + auto weights = makeConst(element::u8, {WEIGHTS, 4096}, MOCK_VALUE); + auto weights_fp16 = makeOP({weights}, {dest_type_f16}); + auto zero_point = makeConst(element::u8, {WEIGHTS, 1}, MOCK_VALUE); + auto zero_point_fp16 = makeOP({zero_point}, {dest_type_f16}); + auto zero_point_subtract = makeOP({weights_fp16, zero_point_fp16}, {numpy_broadcast}); + + auto scale = makeConst(element::f16, 
{WEIGHTS, 1}, MOCK_VALUE); + auto mul_scale = makeOP({zero_point_subtract, scale}, {numpy_broadcast}); + auto fq_weights = makeOP({mul_scale}, {dest_type_f32}); + + return makeOP({fq_weights, input_ids_i64, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_attention_weights() { + auto weights = makeConst(element::u8, {ATTENTION_WEIGHTS, 4096}, MOCK_VALUE); + auto weights_f16 = makeOP({weights}, {dest_type_f16}); + + auto zero_points = makeConst(element::u8, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto zero_points_f16 = makeOP({zero_points}, {dest_type_f16}); + auto subtract = makeOP({weights_f16, zero_points_f16}, {numpy_broadcast}); + + auto scale = makeConst(element::f16, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto mul = makeOP({subtract, scale}, {numpy_broadcast}); + return makeOP({mul}, {dest_type_f32}); + } + + static std::shared_ptr gen_qkv_proj(const std::shared_ptr& embeddings) { + auto _const_0 = single_val(/*rank*/ 3, /*val*/ 2); + auto pow = makeOP({embeddings, _const_0}, {numpy_broadcast}); + auto mean = makeOP({pow, {-1}}, {{"keep_dims", true}}); + + auto _const_1 = single_val(/*rank*/ 3, /*val*/ 1); + auto add = makeOP({mean, _const_1}, {numpy_broadcast}); + auto sqrt = makeOP({add}); + + auto _const_2 = single_val(/*rank*/ 3, /*val*/ 1); + auto div = makeOP({_const_2, sqrt}, {numpy_broadcast, {"m_pythondiv", true}}); + auto mul_0 = makeOP({embeddings, div}, {numpy_broadcast}); + + auto _const_3 = makeConst(element::f32, {1, 1, 4096}, MOCK_VALUE); + auto mul_1 = makeOP({mul_0, _const_3}, {numpy_broadcast}); + auto attention_weights = gen_attention_weights(); + auto linear_matmul = + makeOP({mul_1, attention_weights}, {{"transpose_a", false}, {"transpose_b", true}}); + + auto _const_4 = makeConst(element::f32, {1, 1, ATTENTION_WEIGHTS}, MOCK_VALUE); + auto linear_add = makeOP({linear_matmul, _const_4}, {numpy_broadcast}); + return makeOP({linear_add, 2, {4096, 4096, -1}}); + } + + static std::shared_ptr gen_cache(const std::shared_ptr& input_ids, + const std::shared_ptr& beam_idx, + const std::string& name) { + auto shape_of = makeOP({input_ids}, {{"output_type", "i64"}}); + auto gather = makeOP({shape_of, {0}, 0}, {{"batch_dims", 0}}); + auto concat = makeOP({gather, {0ll}, {32ll}, {128ll}}, {{"axis", 0}}); + auto init_to_read = makeOP({0.000000f, concat}, {{"mode", "numpy"}}); + auto cache = makeOP( + {init_to_read}, + {{"variable_id", name}, {"variable_type", "f32"}, {"variable_shape", PartialShape{DYN, DYN, 32, 128}}}); + return makeOP({cache, beam_idx, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_current_len(const std::shared_ptr& input_ids) { + auto shape_of = makeOP({input_ids}, {{"output_type", "i64"}}); + return makeOP({shape_of, {1}, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_past_len(const std::shared_ptr& k_cache) { + auto shape_of = makeOP({k_cache}, {{"output_type", "i64"}}); + return makeOP({shape_of, {1}, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_total_len(const std::shared_ptr& cur_len, + const std::shared_ptr& past_len) { + return makeOP({cur_len, past_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_rope(QKV idx, + const std::shared_ptr& qkv_proj, + const std::shared_ptr& head_size, + const std::shared_ptr& sliced_sin, + const std::shared_ptr& sliced_cos) { + auto current_k = makeOP({qkv_proj->output(idx), {0, 0, 32, 128}}, {special_zero_true}); + auto sliced_k = makeOP({current_k, {0}, head_size, {1}, {3}}); + auto mul_1 = makeOP({sliced_k, sliced_cos}, {numpy_broadcast}); + + auto 
reshape = makeOP({sliced_k, {0, 0, 32, 2, 64}}, {special_zero_true}); + auto split_1 = makeOP({reshape, -2}, {{"num_splits", 2}}); + auto list_unpack_1 = makeOP({split_1->output(1), -2}); + + auto _const = single_val(/*rank*/ 4, /*val*/ 1); + auto mul_2 = makeOP({list_unpack_1, _const}, {numpy_broadcast}); + auto list_unpack_2 = makeOP({split_1->output(0), -2}); + auto concat = makeOP({mul_2, list_unpack_2}, {{"axis", -1}}); + + auto mul_3 = makeOP({concat, sliced_sin}, {numpy_broadcast}); + return makeOP({mul_1, mul_3}, {numpy_broadcast}); + } + + static std::shared_ptr gen_rope_emb_sin(const std::shared_ptr& total_seq_len, + const std::shared_ptr& neg_mul, + std::shared_ptr& head_size) { + auto sin = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto sliced_sin_by_total = makeOP({sin, {0}, total_seq_len, {1}, {1}}); + auto rotary_emb_sin_shape = makeOP({sliced_sin_by_total}, {{"output_type", "i64"}}); + head_size = makeOP({rotary_emb_sin_shape, {3}, 0}, {{"batch_dims", 0}}); + return makeOP({sliced_sin_by_total, neg_mul, {LLONG_MAX}, {1}, {1}}); + } + + static std::shared_ptr gen_rope_emb_cos(const std::shared_ptr& total_seq_len, + const std::shared_ptr& neg_mul) { + auto cos = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto sliced_cos_by_total = makeOP({cos, {0}, total_seq_len, {1}, {1}}); + return makeOP({sliced_cos_by_total, neg_mul, {LLONG_MAX}, {1}, {1}}); + } + + static std::shared_ptr neg_mul(const std::shared_ptr& current_seq_len) { + return makeOP({current_seq_len, {-1ll}}, {numpy_broadcast}); + } + + static std::shared_ptr gen_V(const std::shared_ptr& cache, const std::shared_ptr& qkv_proj) { + auto v_current = makeOP({qkv_proj->output(2), {0, 0, 32, 128}}, {special_zero_true}); + auto v_total = makeOP({cache, v_current}, {{"axis", 1}}); + return makeOP({v_total, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_K(const std::shared_ptr& cache, const std::shared_ptr& rope_K) { + auto full_k = makeOP({cache, rope_K}, {{"axis", 1}}); + return makeOP({full_k, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_Q(const std::shared_ptr& past_seq_len_2, + const std::shared_ptr& total_seq_len_2, + const std::shared_ptr& rope_Q) { + auto _const = makeConst(element::f32, {1, 32767, 1, 1}, MOCK_VALUE); + auto slice = makeOP({_const, past_seq_len_2, total_seq_len_2, {1}, {1}}); + auto mul = makeOP({rope_Q, slice}, {numpy_broadcast}); + return makeOP({mul, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_total_seq_len_2(const std::shared_ptr& past_k_len, + const std::shared_ptr& rope_k) { + auto shape_rope_k = makeOP({rope_k}, {{"output_type", "i64"}}); + auto cur_len = makeOP({shape_rope_k, {1}, 0}, {{"batch_dims", 0}}); + return makeOP({past_k_len, cur_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_past_seq_len_2(const std::shared_ptr& total_seq_len, + const std::shared_ptr& rope_q) { + auto shape_rope_q = makeOP({rope_q}, {{"output_type", "i64"}}); + auto cur_len = makeOP({shape_rope_q, {1}, 0}, {{"batch_dims", 0}}); + return makeOP({total_seq_len, cur_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_attention_mask(const std::shared_ptr& Q_in, + const std::shared_ptr& attention_mask_in, + const std::shared_ptr& total_seq_len) { + auto _const = makeConst(element::boolean, {1, 1, 8192, 8192}, MOCK_VALUE); + auto shape_of_q = makeOP({Q_in}, {{"output_type", "i64"}}); + auto gather = makeOP({shape_of_q, {2}, 0}, {{"batch_dims", 0}}); + auto sub_1 = makeOP({total_seq_len, gather}, {numpy_broadcast}); + auto concat = makeOP({sub_1, {0ll}}, 
{{"axis", 0}}); + auto broadcast = makeOP({total_seq_len, {2}}, {{"mode", "numpy"}}); + auto slice = makeOP({_const, concat, broadcast, {1, 1}, {2, 3}}); + auto bitwise_not = makeOP({slice}); + + auto _const_1 = single_val(/*rank*/ 4, /*val*/ 1); + auto view_reshape = makeOP({attention_mask_in, {0, 0}}, {special_zero_true}); + auto unsqueeze_0 = makeOP({view_reshape, 1}); + auto unsqueeze_1 = makeOP({unsqueeze_0, 2}); + auto convert_0 = makeOP({unsqueeze_1}, {dest_type_f32}); + + auto _const_2 = single_val(/*rank*/ 4, /*val*/ 1); + auto mul_1 = makeOP({convert_0, _const_2}, {numpy_broadcast}); + auto sub_2 = makeOP({_const_1, mul_1}, {numpy_broadcast}); + + auto _const_3 = single_val(/*rank*/ 4, /*val*/ 1); + auto mul_2 = makeOP({sub_2, _const_3}, {numpy_broadcast}); + auto list_construct = makeOP({{1ll}, {1ll}, gather, {1ll}}, {{"axis", 0}}); + auto expand_broadcast = makeOP({mul_2, list_construct}, {{"mode", "bidirectional"}}); + return makeOP({bitwise_not, -FLT_MAX, expand_broadcast}, {numpy_broadcast}); + } +}; + +class Qwen7bChatPA { +public: + static std::shared_ptr gen_embeddings(const std::shared_ptr& input_ids) { + auto weights = makeConst(element::u8, {WEIGHTS, 4096}, MOCK_VALUE); + auto weights_fp16 = makeOP({weights}, {dest_type_f16}); + + auto zero_point = makeConst(element::u8, {WEIGHTS, 1}, MOCK_VALUE); + auto zero_point_fp16 = makeOP({zero_point}, {dest_type_f16}); + auto sub = makeOP({weights_fp16, zero_point_fp16}, {numpy_broadcast}); + + auto scale = makeConst(element::f16, {WEIGHTS, 1}, MOCK_VALUE); + auto mul = makeOP({sub, scale}, {numpy_broadcast}); + auto mul_fp32 = makeOP({mul}, {dest_type_f32}); + + auto reshape_view = makeOP({input_ids, {-1, 0}}, {special_zero_true}); + auto reshape_view_i64 = makeOP({reshape_view}, {dest_type_i64}); + return makeOP({mul_fp32, reshape_view_i64, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_qkv_proj(const std::shared_ptr& embeddings) { + auto _const_0 = makeConst(element::f32, {1, 1, 1}, MOCK_VALUE); + auto pow = makeOP({embeddings, _const_0}, {numpy_broadcast}); + auto mean = makeOP({pow, {-1}}, {{"keep_dims", true}}); + auto _const_1 = makeConst(element::f32, {1, 1, 1}, MOCK_VALUE); + auto add_0 = makeOP({mean, _const_1}, {numpy_broadcast}); + + auto sqrt = makeOP({add_0}); + auto _const_2 = makeConst(element::f32, {1, 1, 1}, MOCK_VALUE); + auto div = makeOP({_const_2, sqrt}, {numpy_broadcast, {"m_pythondiv", true}}); + auto mul_0 = makeOP({embeddings, div}, {numpy_broadcast}); + + auto _const_3 = makeConst(element::f32, {1, 1, 4096}, MOCK_VALUE); + auto mul_1 = makeOP({mul_0, _const_3}, {numpy_broadcast}); + + auto _const_4 = makeConst(element::u8, {ATTENTION_WEIGHTS, 4096}, MOCK_VALUE); + auto convert_0 = makeOP({_const_4}, {dest_type_f16}); + + auto _const_5 = makeConst(element::u8, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto convert_1 = makeOP({_const_5}, {dest_type_f16}); + auto sub = makeOP({convert_0, convert_1}, {numpy_broadcast}); + + auto _const_6 = makeConst(element::f16, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto mul_2 = makeOP({sub, _const_6}, {numpy_broadcast}); + auto convert_2 = makeOP({mul_2}, {dest_type_f32}); + auto matmul = makeOP({mul_1, convert_2}, {{"transpose_a", false}, {"transpose_b", true}}); + auto Constant_270 = makeConst(element::f32, {1, 1, ATTENTION_WEIGHTS}, MOCK_VALUE); + auto add_1 = makeOP({matmul, Constant_270}, {numpy_broadcast}); + + return makeOP({add_1, 2, {4096, 4096, -1}}); + } + + static std::shared_ptr gen_rope(QKV idx, + const std::shared_ptr& qkv_proj, + const 
std::shared_ptr& head_size, + const std::shared_ptr& sin, + const std::shared_ptr& cos) { + auto Q_or_K = makeOP({qkv_proj->output(idx), {0, 0, 32, 128}}, {special_zero_true}); + auto sliced = makeOP({Q_or_K, {0}, head_size, {1}, {3}}); + auto mul_0 = makeOP({sliced, sin}, {numpy_broadcast}); + + auto reshape = makeOP({sliced, {0, 0, 32, 2, 64}}, {special_zero_true}); + auto split = makeOP({reshape, -2}, {{"num_splits", 2}}); + auto squeeze_0 = makeOP({split->output(1), -2}); + auto _const_0 = makeConst(element::f32, {1, 1, 1, 1}, {1.000000f}); + auto mul_1 = makeOP({squeeze_0, _const_0}, {numpy_broadcast}); + + auto squeeze_1 = makeOP({split->output(0), -2}); + auto concat = makeOP({mul_1, squeeze_1}, {{"axis", -1}}); + auto mul_2 = makeOP({concat, cos}, {numpy_broadcast}); + return makeOP({mul_0, mul_2}, {numpy_broadcast}); + } + + static std::shared_ptr gen_rope_emb_sin(const std::shared_ptr& max_context_len, + const std::shared_ptr& position_ids, + std::shared_ptr& head_size) { + auto sin = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto slice_sin = makeOP({sin, position_ids, 1}, {{"batch_dims", 0}}); + + auto slice = makeOP({sin, {0}, max_context_len, {1}, {1}}); + auto shape_of = makeOP({slice}, {{"output_type", "i64"}}); + head_size = makeOP({shape_of, {3}, 0}, {{"batch_dims", 0}}); + + return makeOP({slice_sin, {-1, 1, 1, 128}}, {{"special_zero", false}}); + } + + static std::shared_ptr gen_rope_emb_cos(const std::shared_ptr& max_context_len, + const std::shared_ptr& position_ids) { + auto cos = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto slice = makeOP({cos, position_ids, 1}, {{"batch_dims", 0}}); + return makeOP({slice, {-1, 1, 1, 128}}, {{"special_zero", false}}); + } + + static std::shared_ptr align_pa_layout(const std::shared_ptr& pa, + const std::shared_ptr& head_size) { + auto shape = makeOP({{0ll}, {1ll}, {-1ll}, head_size}, {{"axis", 0}}); + auto reshaped = makeOP({pa->output(0), shape}, {special_zero_true}); + return makeOP({reshaped, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_current_len(const std::shared_ptr& rope_K) { + auto shape_of = makeOP({rope_K}, {{"output_type", "i32"}}); + return makeOP({shape_of, {1}, 0ll}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_past_len(const std::shared_ptr& input_ids, + const std::shared_ptr& max_context_len) { + auto shape_of = makeOP({input_ids}, {{"output_type", "i64"}}); + auto cur_len = makeOP({shape_of, 1ll, 0ll}, {{"batch_dims", 0}}); + auto cur_len_i32 = makeOP({cur_len}, {{"destination_type", "i32"}}); + + auto past_len = makeOP({max_context_len, cur_len_i32}, {numpy_broadcast}); + auto past_len_i32 = makeOP({past_len}, {{"destination_type", "i32"}}); + return makeOP({past_len_i32, {1}}, {special_zero_true}); + } + + static std::shared_ptr gen_total_len(const std::shared_ptr& cur_len, + const std::shared_ptr& past_len) { + return makeOP({past_len, cur_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_V(const std::shared_ptr& qkv_proj, std::shared_ptr& head_size) { + auto current_V = makeOP({qkv_proj->output(2), {0, 0, 32, 128}}, {special_zero_true}); + auto gather = makeOP({{0, 2, 1, 3}, {0, 2, 1, 3}, 0ll}, {{"batch_dims", 0}}); + auto transpose = makeOP({current_V, gather}); + + auto shape_of = makeOP({transpose}, {{"output_type", "i64"}}); + auto gather_2 = makeOP({shape_of, -1ll, 0ll}, {{"batch_dims", 0}}); + head_size = makeOP({gather_2, 0}); + + return makeOP({transpose, {0, -1}}, {special_zero_true}); + } + + static std::shared_ptr gen_K(const 
std::shared_ptr& rope_K) { + auto gather = makeOP({{0, 2, 1, 3}, {0, 2, 1, 3}, 0ll}, {{"batch_dims", 0}}); + auto transpose = makeOP({rope_K, gather}); + return makeOP({transpose, {0, -1}}, {special_zero_true}); + } + + static std::shared_ptr gen_Q(const std::shared_ptr& total_seq_len, + const std::shared_ptr& rope_Q) { + auto _const_1 = makeConst(element::f32, {1, 32767, 1, 1}, MOCK_VALUE); + auto shape_of = makeOP({rope_Q}, {{"output_type", "i32"}}); + auto current_seq_len = makeOP({shape_of, {1}, 0ll}, {{"batch_dims", 0}}); + auto past_seq_len = makeOP({total_seq_len, current_seq_len}, {numpy_broadcast}); + + auto slice = makeOP({_const_1, past_seq_len, total_seq_len, {1}, {1}}); + auto mul = makeOP({rope_Q, slice}, {numpy_broadcast}); + auto transpose_1 = makeOP({mul, {0, 2, 1, 3}}); + + auto transpose_2 = makeOP({transpose_1, {0, 2, 1, 3}}); + return makeOP({transpose_2, {0, -1}}, {special_zero_true}); + } +}; + +} // namespace + +TEST_F(TransformationTestsF, SDPAToPA_Qwen) { + { + // Inputs to SDPA transformer: + auto beam_idx = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto position_ids = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, el_type_i64}); + auto attention_mask = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, el_type_i64}); + auto input_ids = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, el_type_i64}); + ParameterVector params = nodes_to_params({position_ids, input_ids, attention_mask, beam_idx}); + + beam_idx->output(0).add_names({"beam_idx"}); + position_ids->output(0).add_names({"position_ids"}); + attention_mask->output(0).add_names({"attention_mask"}); + input_ids->output(0).add_names({"input_ids"}); + + // Embeddings processing: + auto embeddings = Qwen7bChatSDPA::gen_embeddings(input_ids); + auto qkv_proj = Qwen7bChatSDPA::gen_qkv_proj(embeddings); + + // KV cache: + auto k_cache = Qwen7bChatSDPA::gen_cache(input_ids, beam_idx, "K_cache"); + auto v_cache = Qwen7bChatSDPA::gen_cache(input_ids, beam_idx, "V_cache"); + + // Current/past/total Seq lengths calculation: + auto current_seq_len = Qwen7bChatSDPA::gen_current_len(input_ids); + auto past_seq_len = Qwen7bChatSDPA::gen_past_len(k_cache); + auto total_seq_len = Qwen7bChatSDPA::gen_total_len(current_seq_len, past_seq_len); + + // RoPE emb sin/cos init: + auto neg_cur_seq_len = Qwen7bChatSDPA::neg_mul(current_seq_len); + auto head_size = shared_ptr(); + auto rope_emb_sin = Qwen7bChatSDPA::gen_rope_emb_sin(total_seq_len, neg_cur_seq_len, head_size); + auto rope_emb_cos = Qwen7bChatSDPA::gen_rope_emb_cos(total_seq_len, neg_cur_seq_len); + + // RoPE for Q,K inputs: + auto rope_q = Qwen7bChatSDPA::gen_rope(QKV::Q, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + auto rope_k = Qwen7bChatSDPA::gen_rope(QKV::K, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + + // Lengths: + auto total_seq_len_2 = Qwen7bChatSDPA::gen_total_seq_len_2(past_seq_len, rope_k); + auto past_seq_len_2 = Qwen7bChatSDPA::gen_past_seq_len_2(total_seq_len_2, rope_q); + + // Q, K, V: + auto Q = Qwen7bChatSDPA::gen_Q(past_seq_len_2, total_seq_len_2, rope_q); + auto K = Qwen7bChatSDPA::gen_K(k_cache, rope_k); + auto V = Qwen7bChatSDPA::gen_V(v_cache, qkv_proj); + + // Attention mask: + auto attention_mask_to_sdpa = Qwen7bChatSDPA::gen_attention_mask(Q, attention_mask, total_seq_len_2); + + // SDPA: + auto sdpa = makeOP({Q, K, V, attention_mask_to_sdpa}, {{"causal", false}}); + auto res = makeOP({sdpa}); + + model = std::make_shared(OutputVector{res}, params); + manager.register_pass(); + } + + { + // Inputs to PA transformer: + 
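+        // What the reference graph below models: after conversion, Q/K/V reach
+        // the paged-attention op flattened to [total_tokens, num_heads * head_size],
+        // together with the per-layer key/value caches and the scheduling inputs
+        // (past_lens, subsequence_begins, block_indices, block_indices_begins,
+        // scale, sliding_window, alibi_slopes, max_context_len) declared next.
+        // The flattening convention used by gen_Q/gen_K/gen_V above, sketched
+        // with assumed opset versions (v1::Reshape, v0::Constant):
+        //
+        //   auto flat = std::make_shared<ov::op::v1::Reshape>(
+        //       transposed,  // layout after the {0, 2, 1, 3} Transpose
+        //       ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {0, -1}),
+        //       /*special_zero=*/true);  // special_zero keeps dim 0; -1 folds the rest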
auto max_context_len = makeOP({}, {{"shape", PartialShape{}}, el_type_i32}); + auto block_indices_begins = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto block_indices = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto subsequence_begins = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto past_lens = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto value_cache_0 = makeOP({}, {{"shape", PartialShape{DYN, 32, 128}}, el_type_f32}); + auto key_cache_0 = makeOP({}, {{"shape", PartialShape{DYN, 32, 128}}, el_type_f32}); + auto input_ids = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto position_ids = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto params = nodes_to_params({max_context_len, + block_indices_begins, + block_indices, + subsequence_begins, + past_lens, + value_cache_0, + key_cache_0, + input_ids, + position_ids}); + + // Inputs pre-processing: + auto max_context_len_i64 = makeOP({max_context_len}, {dest_type_i64}); + auto max_context_len_aligned = makeOP({max_context_len_i64, {1}}, {special_zero_true}); + auto input_ids_aligned = makeOP({input_ids, 1}); + auto position_ids_aligned = makeOP({position_ids, 1}); + + // Embeddings processing: + auto embeddings = Qwen7bChatPA::gen_embeddings(input_ids_aligned); + auto qkv_proj = Qwen7bChatPA::gen_qkv_proj(embeddings); + + // RoPE emb sin/cos init: + auto head_size = shared_ptr(); + auto rope_emb_sin = Qwen7bChatPA::gen_rope_emb_sin(max_context_len_aligned, position_ids_aligned, head_size); + auto rope_emb_cos = Qwen7bChatPA::gen_rope_emb_cos(max_context_len_aligned, position_ids_aligned); + + // rope Q, K: + auto rope_Q = Qwen7bChatPA::gen_rope(QKV::Q, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + auto rope_K = Qwen7bChatPA::gen_rope(QKV::K, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + + // Current/past/total Seq lengths calculation: + auto current_seq_len = Qwen7bChatPA::gen_current_len(rope_K); + auto past_seq_len = Qwen7bChatPA::gen_past_len(input_ids_aligned, max_context_len); + auto total_seq_len = Qwen7bChatPA::gen_total_len(current_seq_len, past_seq_len); + + // Q, K, V: + shared_ptr head_size_2; + auto Q = Qwen7bChatPA::gen_Q(total_seq_len, rope_Q); + auto K = Qwen7bChatPA::gen_K(rope_K); + auto V = Qwen7bChatPA::gen_V(qkv_proj, head_size_2); + + // Additional PA arguments: + auto sliding_window = std::make_shared(element::i32, Shape{}, 0); + auto alibi_slopes = std::make_shared(element::f32, Shape{0}); + auto scale = std::make_shared(element::f32, Shape{}, MOCK_VALUE); + + // PagedAttention: + auto pa = std::make_shared(OutputVector{Q, + K, + V, + key_cache_0, + value_cache_0, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len}); + pa->set_out_type(0, element::i64); + auto pa_aligned = Qwen7bChatPA::align_pa_layout(pa, head_size_2); + auto res = makeOP({pa_aligned}); + + model_ref = std::make_shared(OutputVector{res}, params); + } + // TODO: align precisions, check the copying of "fuse_names" attr in SDPAToPagedAttention + // checking the graph structure and names, other checks are temporarily disabled: + comparator.disable(FunctionsComparator::PRECISIONS); + disable_rt_info_check(); +} + +TEST_F(TransformationTestsF, SDPAToPA_TotalSequenceLengthPatternQwen) { + { + // Inputs to SDPA transformer: + auto beam_idx = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto input_ids = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, 
el_type_i64}); + ParameterVector params = nodes_to_params({input_ids, beam_idx}); + + // K cache + auto k_cache = Qwen7bChatSDPA::gen_cache(input_ids, beam_idx, "K_cache"); + + // Current/past/total Seq lengths calculation: + auto current_len = Qwen7bChatSDPA::gen_current_len(input_ids); + auto past_len = Qwen7bChatSDPA::gen_past_len(k_cache); + auto total_len = Qwen7bChatSDPA::gen_total_len(current_len, past_len); + auto result = std::make_shared(total_len); + + // Expected that these Nodes to be created inside SDPAToPagedAttention + auto new_input_ids = std::make_shared(element::i64, PartialShape{DYN}); + auto axis = v0::Constant::create(element::i32, Shape{}, {1}); + auto aligned_input_ids = std::make_shared(new_input_ids, axis); + + input_ids->output(0).replace(aligned_input_ids); + auto max_context_len = std::make_shared(element::i32, PartialShape{}); + max_context_len->output(0).set_names({"max_context_len"}); + auto position_ids = std::make_shared(element::i64, PartialShape{DYN}); + position_ids->output(0).set_names({"position_ids"}); + + params.push_back(max_context_len); + params.push_back(new_input_ids); + + // Model and Transformations: + model = std::make_shared(ResultVector{result}, params); + manager.register_pass(aligned_input_ids, max_context_len, position_ids); + manager.register_pass(max_context_len); + } + + { + // Inputs to PA transformer: + auto max_context_len = makeOP({}, {{"shape", PartialShape{}}, el_type_i32}); + auto params = nodes_to_params({max_context_len}); + + // Inputs pre-processing: + auto max_context_len_i64 = makeOP({max_context_len}, {dest_type_i64}); + auto max_context_len_aligned = makeOP({max_context_len_i64, {1}}, {special_zero_true}); + + auto result = std::make_shared(max_context_len_aligned); + model_ref = std::make_shared(ResultVector{result}, params); + } + // TODO: align precisions, check the copying of "fuse_names" attr in SDPAToPagedAttention + // checking the graph structure and names, other checks are temporarily disabled: + comparator.disable(FunctionsComparator::PRECISIONS); + disable_result_friendly_names_check(); + disable_rt_info_check(); +} diff --git a/src/core/include/openvino/op/fake_convert.hpp b/src/core/include/openvino/op/fake_convert.hpp index c3eaa43b98a51b..16ef7a0337c15b 100644 --- a/src/core/include/openvino/op/fake_convert.hpp +++ b/src/core/include/openvino/op/fake_convert.hpp @@ -68,6 +68,7 @@ class OPENVINO_API FakeConvert : public Op { bool has_evaluate() const override; std::string get_destination_type() const; + void set_destination_type(ov::element::Type destination_type); const ov::element::Type& get_destination_element_type() const; private: diff --git a/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp b/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp index 74aeacb0719cee..d52e78dbd6a489 100644 --- a/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp +++ b/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp @@ -19,7 +19,7 @@ class OPENVINO_API SDPAToPagedAttention : public ModelPass { public: OPENVINO_MODEL_PASS_RTTI("SDPAToPagedAttention"); - SDPAToPagedAttention(bool use_block_indices_inputs = false, bool use_score_outputs = false); + explicit SDPAToPagedAttention(bool use_block_indices_inputs = false, bool use_score_outputs = false); bool run_on_model(const std::shared_ptr& model) override; private: diff --git a/src/core/src/op/fake_convert.cpp b/src/core/src/op/fake_convert.cpp index 5b3c8f8d8e9938..517674402ef872 100644 --- a/src/core/src/op/fake_convert.cpp +++ 
b/src/core/src/op/fake_convert.cpp @@ -79,6 +79,10 @@ std::string FakeConvert::get_destination_type() const { return m_destination_type.get_type_name(); } +void FakeConvert::set_destination_type(ov::element::Type destination_type) { + m_destination_type = destination_type; +} + const ov::element::Type& FakeConvert::get_destination_element_type() const { return m_destination_type; } diff --git a/src/core/src/pass/manager.cpp b/src/core/src/pass/manager.cpp index a6f1fc287e221c..b084ec4dc38e09 100644 --- a/src/core/src/pass/manager.cpp +++ b/src/core/src/pass/manager.cpp @@ -5,6 +5,7 @@ #include "openvino/pass/manager.hpp" #include +#include #include #include #include diff --git a/src/core/src/pass/sdpa_to_paged_attention.cpp b/src/core/src/pass/sdpa_to_paged_attention.cpp index 872e4539eda8df..e6fc744bb5ef4f 100644 --- a/src/core/src/pass/sdpa_to_paged_attention.cpp +++ b/src/core/src/pass/sdpa_to_paged_attention.cpp @@ -81,15 +81,12 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptrset_partial_shape(PartialShape{-1}); + auto input_ids_target_inputs = input_ids_node->get_output_target_inputs(0); auto unsqueezed_input_ids = std::make_shared(input_ids_node, v0::Constant::create(element::i32, Shape{}, {1})); - replace_node(input_ids_node, unsqueezed_input_ids); - - auto cur_seq_len = std::make_shared(std::make_shared(unsqueezed_input_ids), - v0::Constant::create(element::i64, Shape{}, {1}), - v0::Constant::create(element::i64, Shape{}, {0})); - auto prev_max_seq_len = - std::make_shared(max_context_len, std::make_shared(cur_seq_len, element::i32)); + for (const auto& target : input_ids_target_inputs) { + target.replace_source_output(unsqueezed_input_ids); + } ParameterVector kv_parameters; ParameterVector parameters_to_remove; @@ -106,15 +103,15 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptrset_partial_shape(PartialShape{-1}); position_ids->validate_and_infer_types(); } + auto position_ids_target_inputs = position_ids->get_output_target_inputs(0); auto unsqueezed_position_ids = std::make_shared(position_ids, v0::Constant::create(element::i32, Shape{}, {1})); - replace_node(position_ids, unsqueezed_position_ids); + for (const auto& target : position_ids_target_inputs) { + target.replace_source_output(unsqueezed_position_ids); + } int layer_index = 0; - auto batch_dim = - std::make_shared(position_ids); // it is not always required, so will be disposed if not needed - ov::pass::Manager manager("SDPA to PA"); manager.set_per_pass_validation(false); manager.register_pass(kv_parameters, @@ -127,9 +124,12 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr(prev_max_seq_len, batch_dim); + + manager.register_pass(unsqueezed_input_ids, max_context_len, position_ids); manager.register_pass(max_context_len); - manager.register_pass(unsqueezed_position_ids->output(0)); + manager.register_pass(max_context_len); + manager.register_pass(unsqueezed_position_ids); + manager.register_pass(unsqueezed_position_ids); manager.run_passes(model); { diff --git a/src/frontends/onnx/tests/__init__.py b/src/frontends/onnx/tests/__init__.py index ef8cebfa361e3f..fdf1295dfd1dbe 100644 --- a/src/frontends/onnx/tests/__init__.py +++ b/src/frontends/onnx/tests/__init__.py @@ -147,7 +147,7 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): skip_dynamic_model = pytest.mark.skip(reason="CPU plug-in can't load a model with dynamic output shapes via legacy API") # ONNX 1.14 -xfail_issue_119896 = xfail_test(reason="Unsupported 
element type: FLOAT8") +xfail_issue_119896 = xfail_test(reason="Unsupported element type: FLOAT8", strict=False) xfail_issue_119900 = xfail_test(reason="While validating ONNX node '': " "half_pixel_symmetric - this type of coordinate transformation mode " "is not supported. Choose one of the following modes: " diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp index f8bd16173b8fce..29c8bfddbd1ca4 100644 --- a/src/inference/src/os/lin/lin_system_conf.cpp +++ b/src/inference/src/os/lin/lin_system_conf.cpp @@ -23,76 +23,107 @@ CPU::CPU() { std::vector> system_info_table; std::vector node_info_table; - auto get_cache_info_linux = [&]() { + constexpr int cache_info_mode = 1; + constexpr int freq_info_mode = 2; + + auto get_info_linux = [&](int mode) { int cpu_index = 0; - int cache_index = 0; - int cache_files = 3; + int file_index = 0; + int max_files = 3; - std::vector one_info(cache_files); + std::string one_info; - while (1) { - for (int n = 0; n < cache_files; n++) { - cache_index = (n == 0) ? n : n + 1; - - std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + "/cache/index" + - std::to_string(cache_index) + "/shared_cpu_list"); - if (!cache_file.is_open()) { - cache_index = -1; - break; - } - std::string cache_info; - std::getline(cache_file, cache_info); - one_info[n] = std::move(cache_info); - } + std::string::size_type pos = 0; + std::string::size_type endpos = 0; + std::string sub_str; - if (cache_index == -1) { - if (cpu_index == 0) { - return -1; - } else { - return 0; - } - } else { - system_info_table.push_back(one_info); - cpu_index++; - } + int core_1; + int core_2; + + system_info_table.clear(); + + std::ifstream possible_file("/sys/devices/system/cpu/possible"); + std::string possible_info; + + if (possible_file.is_open()) { + std::getline(possible_file, possible_info); + } else { + return -1; } - return 0; - }; + if ((endpos = possible_info.find('-', pos)) != std::string::npos) { + sub_str = possible_info.substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = possible_info.substr(endpos + 1); + core_2 = std::stoi(sub_str); + system_info_table.resize(core_2 + 1, std::vector(max_files, "")); + } else { + return -1; + } - auto get_freq_info_linux = [&]() { - int cpu_index = 0; - int cache_index = 0; + std::ifstream online_file("/sys/devices/system/cpu/online"); + std::string online_info; - std::vector file_name = {"/topology/core_cpus_list", - "/topology/physical_package_id", - "/cpufreq/cpuinfo_max_freq"}; - int num_of_files = file_name.size(); - std::vector one_info(num_of_files); + if (online_file.is_open()) { + std::getline(online_file, online_info); + } else { + system_info_table.clear(); + return -1; + } while (1) { - for (int n = 0; n < num_of_files; n++) { - cache_index = n; + if ((endpos = online_info.find('-', pos)) != std::string::npos) { + sub_str = online_info.substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = online_info.substr(endpos + 1); + core_2 = std::stoi(sub_str); - std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + file_name[n]); - if (!cache_file.is_open()) { - cache_index = -1; - break; + for (cpu_index = core_1; cpu_index <= core_2; cpu_index++) { + if (mode == cache_info_mode) { + for (int n = 0; n < max_files; n++) { + file_index = (n == 0) ? 
n : n + 1; + one_info.clear(); + + std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + + "/cache/index" + std::to_string(file_index) + "/shared_cpu_list"); + if (cache_file.is_open()) { + std::getline(cache_file, one_info); + } else { + if ((cpu_index == core_1) && (n == 0)) { + system_info_table.clear(); + return -1; + } + } + system_info_table[cpu_index][n] = std::move(one_info); + } + } else { + std::vector file_name = {"/topology/core_cpus_list", + "/topology/physical_package_id", + "/cpufreq/cpuinfo_max_freq"}; + + for (int n = 0; n < max_files; n++) { + one_info.clear(); + + std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + + file_name[n]); + if (cache_file.is_open()) { + std::getline(cache_file, one_info); + } else { + if ((cpu_index == core_1) && (n == 2)) { + system_info_table.clear(); + return -1; + } + } + system_info_table[cpu_index][n] = std::move(one_info); + } + } } - std::string cache_info; - std::getline(cache_file, cache_info); - one_info[n] = std::move(cache_info); } - if (cache_index == -1) { - if (cpu_index == 0) { - return -1; - } else { - return 0; - } + if ((pos = online_info.find(',', endpos)) != std::string::npos) { + pos++; } else { - system_info_table.push_back(one_info); - cpu_index++; + break; } } @@ -190,20 +221,23 @@ CPU::CPU() { } else { _processors = valid_cpu_mapping_table.size(); _cpu_mapping_table.swap(valid_cpu_mapping_table); - update_valid_processor_linux(std::move(phy_core_list), - _numa_nodes, - _cores, - _proc_type_table, - _cpu_mapping_table); + { + std::lock_guard lock{_cpu_mutex}; + update_valid_processor_linux(std::move(phy_core_list), + _numa_nodes, + _cores, + _proc_type_table, + _cpu_mapping_table); + } return 0; } }; get_node_info_linux(); - if (!get_cache_info_linux()) { + if (!get_info_linux(cache_info_mode)) { parse_cache_info_linux(system_info_table, - node_info_table, + std::move(node_info_table), _processors, _numa_nodes, _sockets, @@ -215,9 +249,9 @@ CPU::CPU() { if ((_proc_type_table.size() == 0) || ((_proc_type_table[0][MAIN_CORE_PROC] == 0) && (_proc_type_table[0][ALL_PROC] > 0) && (_proc_type_table[0][ALL_PROC] != _proc_type_table[0][EFFICIENT_CORE_PROC]))) { - if (!get_freq_info_linux()) { + if (!get_info_linux(freq_info_mode)) { parse_freq_info_linux(system_info_table, - node_info_table, + std::move(node_info_table), _processors, _numa_nodes, _sockets, @@ -471,56 +505,73 @@ void parse_cache_info_linux(const std::vector> system_i const std::vector line_value_0({0, 0, 0, 0, -1, -1}); - for (int n = 0; n < _processors; n++) { - if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { - std::string::size_type pos = 0; - std::string::size_type endpos = 0; - std::string sub_str; - - int core_1; - int core_2; + std::vector offline_list; + int info_index = 0; - if (0 == _sockets) { - _proc_type_table.push_back(line_value_0); - } else { - _proc_type_table.push_back(_proc_type_table[0]); - _proc_type_table[0] = line_value_0; - } - - while (1) { - if ((endpos = system_info_table[n][2].find('-', pos)) != std::string::npos) { - sub_str = system_info_table[n][2].substr(pos, endpos - pos); - core_1 = std::stoi(sub_str); - sub_str = system_info_table[n][2].substr(endpos + 1); - core_2 = std::stoi(sub_str); + for (int n = 0; n < _processors; n++) { + if ((system_info_table[n][2].size() > 0) || (system_info_table[n][1].size() > 0)) { + info_index = system_info_table[n][2].size() > 0 ? 
2 : 1; + if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { + std::string::size_type pos = 0; + std::string::size_type endpos = 0; + std::string sub_str; + + int core_1; + int core_2; + + if (0 == _sockets) { + _proc_type_table.push_back(line_value_0); + } else { + _proc_type_table.push_back(_proc_type_table[0]); + _proc_type_table[0] = line_value_0; + } - for (int m = core_1; m <= core_2; m++) { - _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets; - _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID]; - update_proc_map_info(m); + while (1) { + if ((endpos = system_info_table[n][info_index].find('-', pos)) != std::string::npos) { + sub_str = system_info_table[n][info_index].substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = system_info_table[n][info_index].substr(endpos + 1); + core_2 = std::stoi(sub_str); + + if ((info_index == 1) && (core_2 - core_1 == 1)) { + offline_list.push_back(n); + break; + } + for (int m = core_1; m <= core_2; m++) { + _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets; + _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID]; + update_proc_map_info(m); + if (_processors == 0) { + return; + }; + } + } else if (pos != std::string::npos) { + sub_str = system_info_table[n][info_index].substr(pos); + core_1 = std::stoi(sub_str); + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = _sockets; + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + update_proc_map_info(core_1); if (_processors == 0) { return; }; + endpos = pos; } - } else if (pos != std::string::npos) { - sub_str = system_info_table[n][2].substr(pos); - core_1 = std::stoi(sub_str); - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = _sockets; - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - update_proc_map_info(core_1); - if (_processors == 0) { - return; - }; - endpos = pos; - } - if ((pos = system_info_table[n][2].find(',', endpos)) != std::string::npos) { - pos++; - } else { - break; + if ((pos = system_info_table[n][2].find(',', endpos)) != std::string::npos) { + pos++; + } else { + break; + } + } + _sockets++; + if (_proc_type_table[0][ALL_PROC] == 0) { + _proc_type_table.erase(_proc_type_table.begin()); + _sockets--; } } - _sockets++; + } else { + offline_list.push_back(n); } } @@ -540,6 +591,11 @@ void parse_cache_info_linux(const std::vector> system_i _numa_nodes = node_info_table.size(); parse_node_info_linux(node_info_table, _numa_nodes, _sockets, _proc_type_table, _cpu_mapping_table); } + + for (size_t n = 0; n < offline_list.size(); n++) { + _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n); + _processors--; + } }; void get_cpu_mapping_from_cores(const int _processors, @@ -615,7 +671,6 @@ void parse_freq_info_linux(const std::vector> system_in std::vector>& _cpu_mapping_table) { int freq_max = 0; bool ecore_enabled = false; - bool ht_enabled = false; _processors = system_info_table.size(); _numa_nodes = 0; @@ -625,6 +680,8 @@ void parse_freq_info_linux(const std::vector> system_in std::vector line_value_0(PROC_TYPE_TABLE_SIZE, 0); + std::vector offline_list; + auto clean_up_output = [&]() { _processors = 0; _cores = 0; @@ -636,65 +693,68 @@ void parse_freq_info_linux(const std::vector> system_in }; for (int n = 0; n < _processors; n++) { - if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { - std::string::size_type pos = 0; - std::string::size_type endpos1 = 0; - 
std::string::size_type endpos2 = 0; - std::string sub_str; - - int core_1 = 0; - int core_2 = 0; - - if (((endpos1 = system_info_table[n][0].find(',', pos)) != std::string::npos) || - ((endpos2 = system_info_table[n][0].find('-', pos)) != std::string::npos)) { - endpos1 = (endpos1 != std::string::npos) ? endpos1 : endpos2; - sub_str = system_info_table[n][0].substr(pos, endpos1 - pos); - core_1 = std::stoi(sub_str); - sub_str = system_info_table[n][0].substr(endpos1 + 1); - core_2 = std::stoi(sub_str); - if ((core_1 != n) && (core_2 != n)) { - clean_up_output(); - return; - } - - _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = HYPER_THREADING_PROC; - _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + if (system_info_table[n][2].size() > 0) { + if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { + std::string::size_type pos = 0; + std::string::size_type endpos1 = 0; + std::string::size_type endpos2 = 0; + std::string sub_str; + + int core_1 = 0; + int core_2 = 0; + + if (((endpos1 = system_info_table[n][0].find(',', pos)) != std::string::npos) || + ((endpos2 = system_info_table[n][0].find('-', pos)) != std::string::npos)) { + endpos1 = (endpos1 != std::string::npos) ? endpos1 : endpos2; + sub_str = system_info_table[n][0].substr(pos, endpos1 - pos); + core_1 = std::stoi(sub_str); + sub_str = system_info_table[n][0].substr(endpos1 + 1); + core_2 = std::stoi(sub_str); + if ((core_1 != n) && (core_2 != n)) { + clean_up_output(); + return; + } - _cpu_mapping_table[core_2][CPU_MAP_PROCESSOR_ID] = core_2; - _cpu_mapping_table[core_2][CPU_MAP_SOCKET_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_2][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_2][CPU_MAP_CORE_ID] = _cpu_mapping_table[core_1][CPU_MAP_CORE_ID]; - _cpu_mapping_table[core_2][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - _cpu_mapping_table[core_2][CPU_MAP_GROUP_ID] = _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID]; + _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = HYPER_THREADING_PROC; + _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + + _cpu_mapping_table[core_2][CPU_MAP_PROCESSOR_ID] = core_2; + _cpu_mapping_table[core_2][CPU_MAP_SOCKET_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_2][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_2][CPU_MAP_CORE_ID] = _cpu_mapping_table[core_1][CPU_MAP_CORE_ID]; + _cpu_mapping_table[core_2][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + _cpu_mapping_table[core_2][CPU_MAP_GROUP_ID] = _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID]; + + int core_freq = std::stoi(system_info_table[core_1][2]); + freq_max = std::max(core_freq, freq_max); + } else if (system_info_table[n][0].size() > 0) { + core_1 = std::stoi(system_info_table[n][0]); - ht_enabled = true; - int core_freq = std::stoi(system_info_table[core_1][2]); - freq_max = 
std::max(core_freq, freq_max); - } else if (system_info_table[n][0].size() > 0) { - core_1 = std::stoi(system_info_table[n][0]); + _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; - _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; + int core_freq = std::stoi(system_info_table[core_1][2]); + if ((0 == freq_max) || (core_freq >= freq_max * 0.97)) { + freq_max = std::max(core_freq, freq_max); + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + } else { + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = EFFICIENT_CORE_PROC; + ecore_enabled = true; + } - int core_freq = std::stoi(system_info_table[core_1][2]); - if (((0 == freq_max) || (core_freq >= freq_max * 0.95)) && (!ht_enabled)) { - freq_max = std::max(core_freq, freq_max); - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - } else { - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = EFFICIENT_CORE_PROC; - ecore_enabled = true; + _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; } - - _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + _sockets = std::max(_sockets, _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]); + _cores++; } - _sockets = std::max(_sockets, _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]); - _cores++; + } else { + offline_list.push_back(n); } } @@ -733,6 +793,11 @@ void parse_freq_info_linux(const std::vector> system_in _numa_nodes = node_info_table.size(); parse_node_info_linux(node_info_table, _numa_nodes, _sockets, _proc_type_table, _cpu_mapping_table); } + + for (size_t n = 0; n < offline_list.size(); n++) { + _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n); + _processors--; + } }; void update_valid_processor_linux(const std::vector phy_core_list, diff --git a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp index 8679090b9ae491..9ea43bd0604296 100644 --- a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp +++ b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp @@ -385,6 +385,188 @@ LinuxCpuMapTestCase cache_1sockets_96cores = { {"0-95"}, }, }; +LinuxCpuMapTestCase cache_2sockets_56cores_hyperthreading = { + 110, + 2, + 2, + 56, + {{110, 56, 0, 54, -1, -1}, {54, 28, 0, 26, 0, 0}, {56, 28, 0, 28, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {11, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {12, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {13, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {14, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {15, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {16, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {17, 0, 0, 16, 
HYPER_THREADING_PROC, 16, -1}, {18, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {19, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {21, 0, 0, 19, HYPER_THREADING_PROC, 19, -1}, + {22, 0, 0, 20, HYPER_THREADING_PROC, 20, -1}, {23, 0, 0, 21, HYPER_THREADING_PROC, 21, -1}, + {24, 0, 0, 22, HYPER_THREADING_PROC, 22, -1}, {25, 0, 0, 23, HYPER_THREADING_PROC, 23, -1}, + {26, 0, 0, 24, HYPER_THREADING_PROC, 24, -1}, {27, 0, 0, 25, HYPER_THREADING_PROC, 25, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 1, 1, 36, HYPER_THREADING_PROC, 36, -1}, {37, 1, 1, 37, HYPER_THREADING_PROC, 37, -1}, + {38, 1, 1, 38, HYPER_THREADING_PROC, 38, -1}, {39, 1, 1, 39, HYPER_THREADING_PROC, 39, -1}, + {40, 1, 1, 40, HYPER_THREADING_PROC, 40, -1}, {41, 1, 1, 41, HYPER_THREADING_PROC, 41, -1}, + {42, 1, 1, 42, HYPER_THREADING_PROC, 42, -1}, {43, 1, 1, 43, HYPER_THREADING_PROC, 43, -1}, + {44, 1, 1, 44, HYPER_THREADING_PROC, 44, -1}, {45, 1, 1, 45, HYPER_THREADING_PROC, 45, -1}, + {46, 1, 1, 46, HYPER_THREADING_PROC, 46, -1}, {47, 1, 1, 47, HYPER_THREADING_PROC, 47, -1}, + {48, 1, 1, 48, HYPER_THREADING_PROC, 48, -1}, {49, 1, 1, 49, HYPER_THREADING_PROC, 49, -1}, + {50, 1, 1, 50, HYPER_THREADING_PROC, 50, -1}, {51, 1, 1, 51, HYPER_THREADING_PROC, 51, -1}, + {52, 1, 1, 52, HYPER_THREADING_PROC, 52, -1}, {53, 1, 1, 53, HYPER_THREADING_PROC, 53, -1}, + {54, 1, 1, 54, HYPER_THREADING_PROC, 54, -1}, {55, 1, 1, 55, HYPER_THREADING_PROC, 55, -1}, + {56, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {57, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {58, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {59, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {60, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {61, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {62, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {63, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {64, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {65, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {66, 0, 0, 26, MAIN_CORE_PROC, 26, -1}, {67, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, + {68, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, {69, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, + {70, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, {71, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, + {72, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, {73, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, + {74, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, {75, 0, 0, 18, MAIN_CORE_PROC, 18, -1}, + {76, 0, 0, 27, MAIN_CORE_PROC, 27, -1}, {77, 0, 0, 19, MAIN_CORE_PROC, 19, -1}, + {78, 0, 0, 20, MAIN_CORE_PROC, 20, -1}, {79, 0, 0, 21, MAIN_CORE_PROC, 21, -1}, + {80, 0, 0, 22, MAIN_CORE_PROC, 22, -1}, {81, 0, 0, 23, MAIN_CORE_PROC, 23, -1}, + {82, 0, 0, 24, MAIN_CORE_PROC, 24, -1}, {83, 0, 0, 25, MAIN_CORE_PROC, 25, -1}, + {84, 1, 1, 28, MAIN_CORE_PROC, 28, -1}, {85, 1, 1, 29, MAIN_CORE_PROC, 29, -1}, + {86, 1, 1, 30, MAIN_CORE_PROC, 30, -1}, {87, 1, 1, 31, MAIN_CORE_PROC, 31, -1}, + {88, 1, 1, 32, MAIN_CORE_PROC, 32, -1}, {89, 1, 1, 33, MAIN_CORE_PROC, 33, -1}, + {90, 1, 1, 34, MAIN_CORE_PROC, 34, -1}, {91, 1, 1, 35, MAIN_CORE_PROC, 35, -1}, + {92, 1, 1, 36, MAIN_CORE_PROC, 36, -1}, {93, 1, 1, 37, MAIN_CORE_PROC, 37, -1}, + {94, 1, 1, 38, MAIN_CORE_PROC, 38, -1}, {95, 1, 1, 39, MAIN_CORE_PROC, 39, -1}, + {96, 1, 1, 40, MAIN_CORE_PROC, 40, -1}, {97, 1, 1, 41, MAIN_CORE_PROC, 41, -1}, + {98, 1, 1, 42, MAIN_CORE_PROC, 42, -1}, {99, 1, 1, 43, MAIN_CORE_PROC, 43, -1}, + 
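// Editor's note: each expected _cpu_mapping_table row in these test cases follows the
// CPU_MAP_* column order used by the parser above, i.e. {PROCESSOR_ID, SOCKET_ID,
// NUMA_NODE_ID, CORE_ID, CORE_TYPE, GROUP_ID, used flag}, where -1 appears to mean
// "not assigned yet". Hypothetical decode of one row from this table, for orientation:
enum SketchCoreType { HT_SKETCH, MAIN_SKETCH, EFF_SKETCH };  // stand-ins for the *_PROC constants
struct SketchRow { int proc, socket, numa, core; SketchCoreType type; int group, used; };
constexpr SketchRow cpu66 = {66, 0, 0, 26, MAIN_SKETCH, 26, -1};
// logical CPU 66 = physical core 26 on socket 0, a performance core, group 26, unused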
{100, 1, 1, 44, MAIN_CORE_PROC, 44, -1}, {101, 1, 1, 45, MAIN_CORE_PROC, 45, -1}, + {102, 1, 1, 46, MAIN_CORE_PROC, 46, -1}, {103, 1, 1, 47, MAIN_CORE_PROC, 47, -1}, + {104, 1, 1, 48, MAIN_CORE_PROC, 48, -1}, {105, 1, 1, 49, MAIN_CORE_PROC, 49, -1}, + {106, 1, 1, 50, MAIN_CORE_PROC, 50, -1}, {107, 1, 1, 51, MAIN_CORE_PROC, 51, -1}, + {108, 1, 1, 52, MAIN_CORE_PROC, 52, -1}, {109, 1, 1, 53, MAIN_CORE_PROC, 53, -1}, + {110, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {111, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + }, + { + {"0,56", "0,56", "0-9,11-19,21-27,56-83"}, + {"1,57", "1,57", "0-9,11-19,21-27,56-83"}, + {"2,58", "2,58", "0-9,11-19,21-27,56-83"}, + {"3,59", "3,59", "0-9,11-19,21-27,56-83"}, + {"4,60", "4,60", "0-9,11-19,21-27,56-83"}, + {"5,61", "5,61", "0-9,11-19,21-27,56-83"}, + {"6,62", "6,62", "0-9,11-19,21-27,56-83"}, + {"7,63", "7,63", "0-9,11-19,21-27,56-83"}, + {"8,64", "8,64", "0-9,11-19,21-27,56-83"}, + {"9,65", "9,65", "0-9,11-19,21-27,56-83"}, + {"", "", ""}, + {"11,67", "11,67", "0-9,11-19,21-27,56-83"}, + {"12,68", "12,68", "0-9,11-19,21-27,56-83"}, + {"13,69", "13,69", "0-9,11-19,21-27,56-83"}, + {"14,70", "14,70", "0-9,11-19,21-27,56-83"}, + {"15,71", "15,71", "0-9,11-19,21-27,56-83"}, + {"16,72", "16,72", "0-9,11-19,21-27,56-83"}, + {"17,73", "17,73", "0-9,11-19,21-27,56-83"}, + {"18,74", "18,74", "0-9,11-19,21-27,56-83"}, + {"19,75", "19,75", "0-9,11-19,21-27,56-83"}, + {"", "", ""}, + {"21,77", "21,77", "0-9,11-19,21-27,56-83"}, + {"22,78", "22,78", "0-9,11-19,21-27,56-83"}, + {"23,79", "23,79", "0-9,11-19,21-27,56-83"}, + {"24,80", "24,80", "0-9,11-19,21-27,56-83"}, + {"25,81", "25,81", "0-9,11-19,21-27,56-83"}, + {"26,82", "26,82", "0-9,11-19,21-27,56-83"}, + {"27,83", "27,83", "0-9,11-19,21-27,56-83"}, + {"28,84", "28,84", "28-55,84-111"}, + {"29,85", "29,85", "28-55,84-111"}, + {"30,86", "30,86", "28-55,84-111"}, + {"31,87", "31,87", "28-55,84-111"}, + {"32,88", "32,88", "28-55,84-111"}, + {"33,89", "33,89", "28-55,84-111"}, + {"34,90", "34,90", "28-55,84-111"}, + {"35,91", "35,91", "28-55,84-111"}, + {"36,92", "36,92", "28-55,84-111"}, + {"37,93", "37,93", "28-55,84-111"}, + {"38,94", "38,94", "28-55,84-111"}, + {"39,95", "39,95", "28-55,84-111"}, + {"40,96", "40,96", "28-55,84-111"}, + {"41,97", "41,97", "28-55,84-111"}, + {"42,98", "42,98", "28-55,84-111"}, + {"43,99", "43,99", "28-55,84-111"}, + {"44,100", "44,100", "28-55,84-111"}, + {"45,101", "45,101", "28-55,84-111"}, + {"46,102", "46,102", "28-55,84-111"}, + {"47,103", "47,103", "28-55,84-111"}, + {"48,104", "48,104", "28-55,84-111"}, + {"49,105", "49,105", "28-55,84-111"}, + {"50,106", "50,106", "28-55,84-111"}, + {"51,107", "51,107", "28-55,84-111"}, + {"52,108", "52,108", "28-55,84-111"}, + {"53,109", "53,109", "28-55,84-111"}, + {"54,110", "54,110", "28-55,84-111"}, + {"55,111", "55,111", "28-55,84-111"}, + {"0,56", "0,56", "0-9,11-19,21-27,56-83"}, + {"1,57", "1,57", "0-9,11-19,21-27,56-83"}, + {"2,58", "2,58", "0-9,11-19,21-27,56-83"}, + {"3,59", "3,59", "0-9,11-19,21-27,56-83"}, + {"4,60", "4,60", "0-9,11-19,21-27,56-83"}, + {"5,61", "5,61", "0-9,11-19,21-27,56-83"}, + {"6,62", "6,62", "0-9,11-19,21-27,56-83"}, + {"7,63", "7,63", "0-9,11-19,21-27,56-83"}, + {"8,64", "8,64", "0-9,11-19,21-27,56-83"}, + {"9,65", "9,65", "0-9,11-19,21-27,56-83"}, + {"66", "66", "0-9,11-19,21-27,56-83"}, + {"11,67", "11,67", "0-9,11-19,21-27,56-83"}, + {"12,68", "12,68", "0-9,11-19,21-27,56-83"}, + {"13,69", "13,69", "0-9,11-19,21-27,56-83"}, + {"14,70", "14,70", "0-9,11-19,21-27,56-83"}, + {"15,71", "15,71", 
"0-9,11-19,21-27,56-83"}, + {"16,72", "16,72", "0-9,11-19,21-27,56-83"}, + {"17,73", "17,73", "0-9,11-19,21-27,56-83"}, + {"18,74", "18,74", "0-9,11-19,21-27,56-83"}, + {"19,75", "19,75", "0-9,11-19,21-27,56-83"}, + {"76", "76", "0-9,11-19,21-27,56-83"}, + {"21,77", "21,77", "0-9,11-19,21-27,56-83"}, + {"22,78", "22,78", "0-9,11-19,21-27,56-83"}, + {"23,79", "23,79", "0-9,11-19,21-27,56-83"}, + {"24,80", "24,80", "0-9,11-19,21-27,56-83"}, + {"25,81", "25,81", "0-9,11-19,21-27,56-83"}, + {"26,82", "26,82", "0-9,11-19,21-27,56-83"}, + {"27,83", "27,83", "0-9,11-19,21-27,56-83"}, + {"28,84", "28,84", "28-55,84-111"}, + {"29,85", "29,85", "28-55,84-111"}, + {"30,86", "30,86", "28-55,84-111"}, + {"31,87", "31,87", "28-55,84-111"}, + {"32,88", "32,88", "28-55,84-111"}, + {"33,89", "33,89", "28-55,84-111"}, + {"34,90", "34,90", "28-55,84-111"}, + {"35,91", "35,91", "28-55,84-111"}, + {"36,92", "36,92", "28-55,84-111"}, + {"37,93", "37,93", "28-55,84-111"}, + {"38,94", "38,94", "28-55,84-111"}, + {"39,95", "39,95", "28-55,84-111"}, + {"40,96", "40,96", "28-55,84-111"}, + {"41,97", "41,97", "28-55,84-111"}, + {"42,98", "42,98", "28-55,84-111"}, + {"43,99", "43,99", "28-55,84-111"}, + {"44,100", "44,100", "28-55,84-111"}, + {"45,101", "45,101", "28-55,84-111"}, + {"46,102", "46,102", "28-55,84-111"}, + {"47,103", "47,103", "28-55,84-111"}, + {"48,104", "48,104", "28-55,84-111"}, + {"49,105", "49,105", "28-55,84-111"}, + {"50,106", "50,106", "28-55,84-111"}, + {"51,107", "51,107", "28-55,84-111"}, + {"52,108", "52,108", "28-55,84-111"}, + {"53,109", "53,109", "28-55,84-111"}, + {"54,110", "54,110", "28-55,84-111"}, + {"55,111", "55,111", "28-55,84-111"}, + }, + { + {"0-9,11-19,21-27,56-83"}, + {"28-55,84-111"}, + }, +}; LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading = { 96, 2, @@ -1005,6 +1187,36 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading_1 = { }, {}, }; +LinuxCpuMapTestCase cache_1sockets_16cores_hyperthreading = { + 20, + 1, + 1, + 14, + {{20, 6, 8, 6, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1}, {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1}, + {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1}, {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1}, + {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1}, {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1}, + {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1}, {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1}, + }, + { + {"0,5", "0,5", "0-19"}, {"1-2", "1-2", "0-19"}, {"1-2", "1-2", "0-19"}, {"3-4", "3-4", "0-19"}, + {"3-4", "3-4", "0-19"}, {"0,5", "0,5", "0-19"}, {"6-7", "6-7", "0-19"}, {"6-7", "6-7", "0-19"}, + {"8-9", "8-9", "0-19"}, {"8-9", "8-9", "0-19"}, {"10-11", "10-11", "0-19"}, {"10-11", "10-11", "0-19"}, + {"12", "12-15", "0-19"}, {"13", "12-15", "0-19"}, {"14", "12-15", "0-19"}, {"15", "12-15", "0-19"}, + {"16", "16-19", "0-19"}, {"17", "16-19", "0-19"}, {"18", "16-19", "0-19"}, {"19", "16-19", "0-19"}, + {"20", "20-21", ""}, {"21", "20-21", ""}, + }, + { + {"0-21"}, + }, +}; LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading = { 20, 1, @@ -1135,6 +1347,36 @@ 
LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading = { }, {{"0-11"}}, }; +LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading_1 = { + 8, + 1, + 1, + 8, + {{8, 4, 4, 0, 0, 0}}, + { + {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {2, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {3, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {4, 0, 0, 4, EFFICIENT_CORE_PROC, 4, -1}, + {5, 0, 0, 5, EFFICIENT_CORE_PROC, 4, -1}, + {6, 0, 0, 6, EFFICIENT_CORE_PROC, 4, -1}, + {7, 0, 0, 7, EFFICIENT_CORE_PROC, 4, -1}, + }, + { + {"0", "0", "0-3"}, + {"1", "1", "0-3"}, + {"2", "2", "0-3"}, + {"3", "3", "0-3"}, + {"4", "4-7", ""}, + {"5", "4-7", ""}, + {"6", "4-7", ""}, + {"7", "4-7", ""}, + }, + { + {"0-7"}, + }, +}; LinuxCpuMapTestCase cache_1sockets_6cores_hyperthreading = { 12, 1, @@ -1220,6 +1462,7 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, LinuxCpuMapCacheParserTests, testing::Values(cache_2sockets_104cores_hyperthreading, cache_1sockets_96cores, + cache_2sockets_56cores_hyperthreading, cache_2sockets_48cores_hyperthreading, cache_2sockets_48cores_hyperthreading_1, cache_2sockets_24cores_hyperthreading, @@ -1229,10 +1472,12 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, cache_2sockets_48cores_2, cache_2sockets_20cores_hyperthreading, cache_2sockets_20cores_hyperthreading_1, + cache_1sockets_16cores_hyperthreading, cache_1sockets_14cores_hyperthreading, cache_1sockets_14cores_hyperthreading_1, cache_1sockets_10cores_hyperthreading, cache_1sockets_8cores_hyperthreading, + cache_1sockets_8cores_hyperthreading_1, cache_1sockets_6cores_hyperthreading, cache_1sockets_4cores, cache_VM_cache_0)); diff --git a/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp index 04ab617961b953..8ccdfad011d19c 100644 --- a/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp +++ b/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp @@ -258,6 +258,188 @@ LinuxCpuMapTestCase freq_2sockets_112cores_hyperthreading = { }, // param[in]: The CPU frequency information table of this simulated platform {{"0-55,112-167"}, {"56-111,168-223"}}, // param[in]: The numa node information table of this simulated platform }; +LinuxCpuMapTestCase freq_2sockets_56cores_hyperthreading = { + 110, + 2, + 2, + 56, + {{110, 56, 0, 54, -1, -1}, {54, 28, 0, 26, 0, 0}, {56, 28, 0, 28, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {11, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {12, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {13, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {14, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {15, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {16, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {17, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {18, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {19, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {21, 0, 0, 19, HYPER_THREADING_PROC, 19, -1}, + {22, 0, 0, 20, HYPER_THREADING_PROC, 20, -1}, {23, 0, 0, 21, HYPER_THREADING_PROC, 21, -1}, + {24, 0, 0, 22, HYPER_THREADING_PROC, 22, -1}, {25, 0, 0, 23, HYPER_THREADING_PROC, 23, -1}, + {26, 0, 0, 24, HYPER_THREADING_PROC, 24, -1}, {27, 0, 0, 25, HYPER_THREADING_PROC, 25, 
-1}, + {28, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {29, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {30, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {31, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {32, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {33, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {34, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {35, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {36, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {37, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {38, 1, 1, 36, HYPER_THREADING_PROC, 36, -1}, {39, 1, 1, 37, HYPER_THREADING_PROC, 37, -1}, + {40, 1, 1, 38, HYPER_THREADING_PROC, 38, -1}, {41, 1, 1, 39, HYPER_THREADING_PROC, 39, -1}, + {42, 1, 1, 40, HYPER_THREADING_PROC, 40, -1}, {43, 1, 1, 41, HYPER_THREADING_PROC, 41, -1}, + {44, 1, 1, 42, HYPER_THREADING_PROC, 42, -1}, {45, 1, 1, 43, HYPER_THREADING_PROC, 43, -1}, + {46, 1, 1, 44, HYPER_THREADING_PROC, 44, -1}, {47, 1, 1, 45, HYPER_THREADING_PROC, 45, -1}, + {48, 1, 1, 46, HYPER_THREADING_PROC, 46, -1}, {49, 1, 1, 47, HYPER_THREADING_PROC, 47, -1}, + {50, 1, 1, 48, HYPER_THREADING_PROC, 48, -1}, {51, 1, 1, 49, HYPER_THREADING_PROC, 49, -1}, + {52, 1, 1, 50, HYPER_THREADING_PROC, 50, -1}, {53, 1, 1, 51, HYPER_THREADING_PROC, 51, -1}, + {54, 1, 1, 52, HYPER_THREADING_PROC, 52, -1}, {55, 1, 1, 53, HYPER_THREADING_PROC, 53, -1}, + {56, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {57, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {58, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {59, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {60, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {61, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {62, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {63, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {64, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {65, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {66, 0, 0, 54, MAIN_CORE_PROC, 54, -1}, {67, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, + {68, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, {69, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, + {70, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, {71, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, + {72, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, {73, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, + {74, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, {75, 0, 0, 18, MAIN_CORE_PROC, 18, -1}, + {76, 0, 0, 55, MAIN_CORE_PROC, 55, -1}, {77, 0, 0, 19, MAIN_CORE_PROC, 19, -1}, + {78, 0, 0, 20, MAIN_CORE_PROC, 20, -1}, {79, 0, 0, 21, MAIN_CORE_PROC, 21, -1}, + {80, 0, 0, 22, MAIN_CORE_PROC, 22, -1}, {81, 0, 0, 23, MAIN_CORE_PROC, 23, -1}, + {82, 0, 0, 24, MAIN_CORE_PROC, 24, -1}, {83, 0, 0, 25, MAIN_CORE_PROC, 25, -1}, + {84, 1, 1, 26, MAIN_CORE_PROC, 26, -1}, {85, 1, 1, 27, MAIN_CORE_PROC, 27, -1}, + {86, 1, 1, 28, MAIN_CORE_PROC, 28, -1}, {87, 1, 1, 29, MAIN_CORE_PROC, 29, -1}, + {88, 1, 1, 30, MAIN_CORE_PROC, 30, -1}, {89, 1, 1, 31, MAIN_CORE_PROC, 31, -1}, + {90, 1, 1, 32, MAIN_CORE_PROC, 32, -1}, {91, 1, 1, 33, MAIN_CORE_PROC, 33, -1}, + {92, 1, 1, 34, MAIN_CORE_PROC, 34, -1}, {93, 1, 1, 35, MAIN_CORE_PROC, 35, -1}, + {94, 1, 1, 36, MAIN_CORE_PROC, 36, -1}, {95, 1, 1, 37, MAIN_CORE_PROC, 37, -1}, + {96, 1, 1, 38, MAIN_CORE_PROC, 38, -1}, {97, 1, 1, 39, MAIN_CORE_PROC, 39, -1}, + {98, 1, 1, 40, MAIN_CORE_PROC, 40, -1}, {99, 1, 1, 41, MAIN_CORE_PROC, 41, -1}, + {100, 1, 1, 42, MAIN_CORE_PROC, 42, -1}, {101, 1, 1, 43, MAIN_CORE_PROC, 43, -1}, + {102, 1, 1, 44, MAIN_CORE_PROC, 44, -1}, {103, 1, 1, 45, MAIN_CORE_PROC, 45, -1}, + {104, 1, 1, 46, MAIN_CORE_PROC, 46, -1}, {105, 1, 1, 47, MAIN_CORE_PROC, 47, -1}, + {106, 1, 1, 48, MAIN_CORE_PROC, 48, -1}, {107, 1, 1, 49, MAIN_CORE_PROC, 49, -1}, + {108, 1, 1, 50, MAIN_CORE_PROC, 50, -1}, {109, 1, 1, 51, MAIN_CORE_PROC, 51, -1}, + {110, 1, 1, 52, 
MAIN_CORE_PROC, 52, -1}, {111, 1, 1, 53, MAIN_CORE_PROC, 53, -1}, + }, + { + {"0,56", "0", "3500000"}, + {"1,57", "0", "3500000"}, + {"2,58", "0", "3500000"}, + {"3,59", "0", "3500000"}, + {"4,60", "0", "3500000"}, + {"5,61", "0", "3500000"}, + {"6,62", "0", "3500000"}, + {"7,63", "0", "3500000"}, + {"8,64", "0", "3500000"}, + {"9,65", "0", "3500000"}, + {"", "", ""}, + {"11,67", "0", "3500000"}, + {"12,68", "0", "3500000"}, + {"13,69", "0", "3500000"}, + {"14,70", "0", "3500000"}, + {"15,71", "0", "3500000"}, + {"16,72", "0", "3500000"}, + {"17,73", "0", "3500000"}, + {"18,74", "0", "3500000"}, + {"19,75", "0", "3500000"}, + {"", "", ""}, + {"21,77", "0", "3500000"}, + {"22,78", "0", "3500000"}, + {"23,79", "0", "3500000"}, + {"24,80", "0", "3500000"}, + {"25,81", "0", "3500000"}, + {"26,82", "0", "3500000"}, + {"27,83", "0", "3500000"}, + {"28,84", "1", "3500000"}, + {"29,85", "1", "3500000"}, + {"30,86", "1", "3500000"}, + {"31,87", "1", "3500000"}, + {"32,88", "1", "3500000"}, + {"33,89", "1", "3500000"}, + {"34,90", "1", "3500000"}, + {"35,91", "1", "3500000"}, + {"36,92", "1", "3500000"}, + {"37,93", "1", "3500000"}, + {"38,94", "1", "3500000"}, + {"39,95", "1", "3500000"}, + {"40,96", "1", "3500000"}, + {"41,97", "1", "3500000"}, + {"42,98", "1", "3500000"}, + {"43,99", "1", "3500000"}, + {"44,100", "1", "3500000"}, + {"45,101", "1", "3500000"}, + {"46,102", "1", "3500000"}, + {"47,103", "1", "3500000"}, + {"48,104", "1", "3500000"}, + {"49,105", "1", "3500000"}, + {"50,106", "1", "3500000"}, + {"51,107", "1", "3500000"}, + {"52,108", "1", "3500000"}, + {"53,109", "1", "3500000"}, + {"54,110", "1", "3500000"}, + {"55,111", "1", "3500000"}, + {"0,56", "0", "3500000"}, + {"1,57", "0", "3500000"}, + {"2,58", "0", "3500000"}, + {"3,59", "0", "3500000"}, + {"4,60", "0", "3500000"}, + {"5,61", "0", "3500000"}, + {"6,62", "0", "3500000"}, + {"7,63", "0", "3500000"}, + {"8,64", "0", "3500000"}, + {"9,65", "0", "3500000"}, + {"66", "0", "3500000"}, + {"11,67", "0", "3500000"}, + {"12,68", "0", "3500000"}, + {"13,69", "0", "3500000"}, + {"14,70", "0", "3500000"}, + {"15,71", "0", "3500000"}, + {"16,72", "0", "3500000"}, + {"17,73", "0", "3500000"}, + {"18,74", "0", "3500000"}, + {"19,75", "0", "3500000"}, + {"76", "0", "3500000"}, + {"21,77", "0", "3500000"}, + {"22,78", "0", "3500000"}, + {"23,79", "0", "3500000"}, + {"24,80", "0", "3500000"}, + {"25,81", "0", "3500000"}, + {"26,82", "0", "3500000"}, + {"27,83", "0", "3500000"}, + {"28,84", "1", "3500000"}, + {"29,85", "1", "3500000"}, + {"30,86", "1", "3500000"}, + {"31,87", "1", "3500000"}, + {"32,88", "1", "3500000"}, + {"33,89", "1", "3500000"}, + {"34,90", "1", "3500000"}, + {"35,91", "1", "3500000"}, + {"36,92", "1", "3500000"}, + {"37,93", "1", "3500000"}, + {"38,94", "1", "3500000"}, + {"39,95", "1", "3500000"}, + {"40,96", "1", "3500000"}, + {"41,97", "1", "3500000"}, + {"42,98", "1", "3500000"}, + {"43,99", "1", "3500000"}, + {"44,100", "1", "3500000"}, + {"45,101", "1", "3500000"}, + {"46,102", "1", "3500000"}, + {"47,103", "1", "3500000"}, + {"48,104", "1", "3500000"}, + {"49,105", "1", "3500000"}, + {"50,106", "1", "3500000"}, + {"51,107", "1", "3500000"}, + {"52,108", "1", "3500000"}, + {"53,109", "1", "3500000"}, + {"54,110", "1", "3500000"}, + {"55,111", "1", "3500000"}, + }, + { + {"0-9,11-19,21-27,56-83"}, + {"28-55,84-111"}, + }, +}; LinuxCpuMapTestCase freq_2sockets_48cores_hyperthreading = { 96, 2, @@ -987,6 +1169,7 @@ TEST_P(LinuxCpuMapFreqParserTests, LinuxFreq) {} INSTANTIATE_TEST_SUITE_P(CPUMap, 
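// Editor's note: each row of the frequency tables above appears to be
// {thread_siblings, physical_package_id, max_frequency_kHz}; the two empty rows model
// offline CPUs 10 and 20, which is why this case expects 110 processors out of 112 and
// routes those two ids into offline_list. Quick scalar check of the classification
// rule used by the parser (values are illustrative):
int freq_max = 0;
const int core_freq = std::stoi("3500000");                // 3.5 GHz in kHz
const bool is_main = (0 == freq_max) || (core_freq >= freq_max * 0.97);
// the first core seen is always classified MAIN_CORE_PROC, since freq_max starts at 0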
LinuxCpuMapFreqParserTests, testing::Values(freq_2sockets_112cores_hyperthreading, + freq_2sockets_56cores_hyperthreading, freq_2sockets_48cores_hyperthreading, freq_2sockets_48cores_hyperthreading_1, freq_2sockets_24cores_hyperthreading, diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 67c538bd78341a..865ec1f692b762 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -144,6 +144,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"Loop", Type::TensorIterator}, {"ReadValue", Type::MemoryInput}, // for construction from name ctor, arbitrary name is used {"Assign", Type::MemoryOutput}, // for construction from layer ctor + {"ReadValueWithSubgraph", Type::MemoryInput}, {"Convert", Type::Convert}, {"NV12toRGB", Type::ColorConvert}, {"NV12toBGR", Type::ColorConvert}, diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 457f8368f734dd..1c5598b6d55e26 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -36,6 +36,8 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { case dnnl::memory::data_type::s4: case dnnl::memory::data_type::u4: case dnnl::memory::data_type::f8_e8m0: + case dnnl::memory::data_type::f8_e4m3: + case dnnl::memory::data_type::f8_e5m2: case dnnl::memory::data_type::f4_e2m1: return 1; case dnnl::memory::data_type::undef: @@ -70,6 +72,10 @@ dnnl::memory::data_type DnnlExtensionUtils::ElementTypeToDataType(const ov::elem return memory::data_type::u4; case ov::element::f8e8m0: return memory::data_type::f8_e8m0; + case ov::element::f8e4m3: + return memory::data_type::f8_e4m3; + case ov::element::f8e5m2: + return memory::data_type::f8_e5m2; case ov::element::f4e2m1: return memory::data_type::f4_e2m1; case ov::element::undefined: @@ -106,6 +112,10 @@ ov::element::Type DnnlExtensionUtils::DataTypeToElementType(const dnnl::memory:: return ov::element::u4; case memory::data_type::f8_e8m0: return ov::element::f8e8m0; + case memory::data_type::f8_e4m3: + return ov::element::f8e4m3; + case memory::data_type::f8_e5m2: + return ov::element::f8e5m2; case memory::data_type::f4_e2m1: return ov::element::f4e2m1; case memory::data_type::undef: diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp index 2bfbaa68880aa8..6ad7d758b9ff07 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp @@ -11,13 +11,14 @@ namespace intel_cpu { class jit_uni_vcvtneps2bf16 : public jit_emitter { public: + enum class conversion_mode { default_mode, saturation_mode }; jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::bf16) + ov::element::Type exec_prc = ov::element::bf16, + conversion_mode mode = conversion_mode::default_mode) : jit_emitter(host, host_isa, exec_prc) { - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) && - !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) - prepare_table(); + prepare_table(); + mode_ = mode; } size_t get_inputs_num() const override { @@ -25,6 +26,7 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { } private: + conversion_mode mode_ = conversion_mode::default_mode; void 
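// Editor's sketch (illustrative, not part of the patch): with the new constructor
// parameter a call site can opt into the saturating conversion, mirroring the eltwise
// usage later in this patch; host and host_isa are placeholders here.
auto cvt = std::make_shared<jit_uni_vcvtneps2bf16>(
    host,
    host_isa,
    ov::element::bf16,
    jit_uni_vcvtneps2bf16::conversion_mode::saturation_mode);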
emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override { if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); @@ -44,6 +46,25 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { conditional3::type; Vmm in = Vmm(in_vec_idxs[0]); + if (mode_ == conversion_mode::saturation_mode) { + Vmm vmm_temp = Vmm(out_vec_idxs[0]); + + h->uni_vmaxps(vmm_temp, in, table_val("bf16_min")); + h->uni_vminps(vmm_temp, vmm_temp, table_val("bf16_max")); + + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) { + h->vfixupimmps(vmm_temp, in, table_val("selector"), 0); + } else { + Vmm mask = Vmm(aux_vec_idxs[0]); + h->uni_vcmpps(mask, in, in, 0x03); // _CMP_UNORD_Q + h->uni_vblendvps(vmm_temp, vmm_temp, table_val("nan"), mask); + h->uni_vcmpps(mask, in, table_val("inf"), 0x00); // _CMP_EQ_OQ + h->uni_vblendvps(vmm_temp, vmm_temp, table_val("inf"), mask); + h->uni_vcmpps(mask, in, table_val("neg_inf"), 0x00); // _CMP_EQ_OQ + h->uni_vblendvps(vmm_temp, vmm_temp, table_val("neg_inf"), mask); + } + h->uni_vmovups(in, vmm_temp); + } if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) { Ymm out = Ymm(out_vec_idxs[0]); @@ -119,6 +140,11 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { push_arg_entry_of("rounding", 0x00010000, true); push_arg_entry_of("selector", selector_int32, true); push_arg_entry_of("mask_truncation_word", 0x0000ffff, true); + push_arg_entry_of("bf16_max", 0x7F7F0000, true); + push_arg_entry_of("bf16_min", 0xFF7F0000, true); + push_arg_entry_of("nan", 0x7FC00000, true); + push_arg_entry_of("inf", 0x7F800000, true); + push_arg_entry_of("neg_inf", 0xFF800000, true); } size_t aux_vecs_count() const override { diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index bdb5211009a22a..95de3720bb1e25 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -23,6 +23,7 @@ #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/x64/op/interaction.hpp" @@ -78,6 +79,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::intel_cpu::SwishNode) \ OP_EXTENSION(ov::intel_cpu::SDPAWithTransposeReshape) \ OP_EXTENSION(ov::intel_cpu::NgramNode) \ + OP_EXTENSION(ov::intel_cpu::ReadValueWithSubgraph) \ OP_EXTENSION(ov::op::internal::GatherCompressed) \ OP_EXTENSION(ov::op::internal::NonMaxSuppressionIEInternal) \ OP_EXTENSION(ov::op::internal::MulticlassNmsIEInternal) \ diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index ffd58fdb162899..3cdd2f389d29f8 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -357,6 +357,10 @@ void average_counters(const Graph& graph) { * - _.csv * For example: 0_MyModel.csv */ + if (!graph.getGraphContext()) { + DEBUG_LOG("graph.m_context is null. 
Don't dump average_counters."); + return; + } const std::string& path = graph.getConfig().debugCaps.averageCountersPath; diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index fe0df309dc32f1..1cab7ab7d8c60a 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -2935,12 +2935,19 @@ void GraphOptimizer::MatchSdpaKvCache(Graph& graph) { auto memInputNode = std::dynamic_pointer_cast(node); OPENVINO_ASSERT(memInputNode, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); - ov::optional input_shape; - ov::optional input_prc; - + ov::optional> inputShapes; + ov::optional> inputPrcs; if (!node->getParentEdges().empty()) { - input_shape = ov::optional(node->getInputShapeAtPort(0)); - input_prc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); + inputShapes = ov::optional>(std::vector{}); + inputPrcs = ov::optional>(std::vector{}); + + auto& input_shape_vec = *inputShapes; + auto& input_prc_vec = *inputPrcs; + + for (size_t i = 0; i < node->getParentEdges().size(); i++) { + input_shape_vec.push_back(node->getInputShapeAtPort(i)); + input_prc_vec.push_back(node->getOriginalInputPrecisionAtPort(i)); + } } // search for SDPA @@ -2966,8 +2973,8 @@ void GraphOptimizer::MatchSdpaKvCache(Graph& graph) { memInputNode->getOutputShapeAtPort(0), memInputNode->getOriginalOutputPrecisionAtPort(0), graph.getGraphContext(), - input_shape, - input_prc, + inputShapes, + inputPrcs, sdpa); if (!memInputNode->getParentEdges().empty()) { @@ -3064,12 +3071,18 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { auto memInputNode = std::dynamic_pointer_cast(node); OPENVINO_ASSERT(memInputNode, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); - ov::optional inputShape; - ov::optional inputPrc; - + ov::optional> inputShapes; + ov::optional> inputPrcs; if (!node->getParentEdges().empty()) { - inputShape = ov::optional(node->getInputShapeAtPort(0)); - inputPrc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); + inputShapes = ov::optional>(std::vector{}); + inputPrcs = ov::optional>(std::vector{}); + + auto& input_shape_vec = *inputShapes; + auto& input_prc_vec = *inputPrcs; + for (size_t i = 0; i < node->getParentEdges().size(); i++) { + input_shape_vec.push_back(node->getInputShapeAtPort(i)); + input_prc_vec.push_back(node->getOriginalInputPrecisionAtPort(i)); + } } // search for the MemoryOutputNode @@ -3086,6 +3099,10 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { graph.RemoveEdge(memoryOutputNode->getParentEdgeAt(0)); // there are no output edges from MemoryOutput nodes + CPU_GRAPH_OPTIMIZER_SCOPE(DropRedundantMemoryOutput_SubGraph); + auto memInpNd = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(memInpNd, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); + // now replace the existing MemoryInput with a special type that works without the corresponding MemoryOutput auto memInputSingle = std::make_shared(memInputNode->getId(), memInputNode->getName(), @@ -3093,17 +3110,24 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { memInputNode->getOutputShapeAtPort(0), memInputNode->getOriginalOutputPrecisionAtPort(0), graph.getGraphContext(), - inputShape, - inputPrc); - + inputShapes, + inputPrcs, + memInpNd->getSubGraph()); graph.AddNode(memInputSingle); if (!memInputNode->getParentEdges().empty()) { - auto parentEdge = memInputNode->getParentEdgeAt(0); - auto parent = 
parentEdge->getParent(); - const auto inputNum = parentEdge->getInputNum(); - graph.RemoveEdge(parentEdge); - graph.CreateEdge(parent, memInputSingle, inputNum, 0); + auto parentEdgeNum = memInputNode->getParentEdges().size(); + std::vector parentEdges; + for (size_t i = 0; i < parentEdgeNum; i++) { + auto parentEdge = memInputNode->getParentEdgeAt(i); + auto parent = parentEdge->getParent(); + const auto inputNum = parentEdge->getInputNum(); + parentEdges.push_back(parentEdge); + graph.CreateEdge(parent, memInputSingle, inputNum, parentEdge->getOutputNum()); + } + for (auto parentEdge : parentEdges) { + graph.RemoveEdge(parentEdge); + } } for (auto&& edge : memInputNode->getChildEdgesAtPort(0)) { diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index 0c8cddd905dc2e..f6aabe376d6eec 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -9,6 +9,7 @@ #include "utils/bfloat16.hpp" #if defined(OPENVINO_ARCH_X86_64) +# include "cpu/x64/jit_avx512_core_fp8cvt.hpp" # include "nodes/kernels/x64/jit_kernel.hpp" #else # include "cpu_memory.h" @@ -27,6 +28,18 @@ using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu::x64; using namespace Xbyak; +enum f8_type { none, f8e4m3, f8e5m2 }; + +template +f8_type get_f8_type() { + if (std::is_same::value || std::is_same::value) { + return f8_type::f8e4m3; + } else if (std::is_same::value || std::is_same::value) { + return f8_type::f8e5m2; + } + return f8_type::none; +} + template void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst); @@ -50,12 +63,14 @@ void convert_vec(jit_generator& gen, const RegExp& src, cons gen.movdqu(gen.xword[dst], f16vec); } +template class jit_convert_array : public jit_kernel { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_convert_array) void generate() override { - constexpr size_t vlen = 8u; - constexpr size_t vlen_log2 = 3; + bool is_fp8 = f8_e4m3_emu_ || f8_e5m2_emu_; + size_t vlen = is_fp8 ? 16u : 8u; + size_t vlen_log2 = is_fp8 ? 
4 : 3; preamble(); @@ -84,17 +99,24 @@ class jit_convert_array : public jit_kernel { auto tail_size = var(); tail_size = size; - tail_size <<= static_cast(std::logb(_src_size)) - 1; - copy(tmp.pointer(), src, tail_size); + tail_size <<= static_cast(std::logb(_src_size)); + copy(tmp.pointer(), src, tail_size); _convert_vec(*this, tmp.pointer(), tmp.pointer()); tail_size = size; - tail_size <<= static_cast(std::logb(_dst_size)) - 1; - copy(dst, tmp.pointer(), tail_size); + tail_size <<= static_cast(std::logb(_dst_size)); + copy(dst, tmp.pointer(), tail_size); }); postamble(); + + if (f8_e4m3_emu_) + f8_e4m3_emu_->prepare_table(); + if (f8_e5m2_emu_) + f8_e5m2_emu_->prepare_table(); + if (uni_vcvtneps2bf16_) + uni_vcvtneps2bf16_->emit_data(); } public: @@ -108,16 +130,37 @@ class jit_convert_array : public jit_kernel { typedef void (*convert_vec_t)(jit_generator&, const RegExp&, const RegExp&); - jit_convert_array(convert_vec_t convert_vec, size_t src_size, size_t dst_size) + jit_convert_array(convert_vec_t convert_vec) : jit_kernel(jit_name()), _convert_vec(convert_vec), - _src_size(src_size), - _dst_size(dst_size) {} + _src_size(sizeof(src_t)), + _dst_size(sizeof(dst_t)) { + const auto type = get_f8_type(); + if (type == f8_type::f8e4m3) { + f8_e4m3_emu_ = std::make_shared(this, + fp8_emu_reserv_1_, + fp8_emu_reserv_2_, + fp8_emu_reserv_3_, + fp8_emu_reserv_4_, + fp8_emu_reserv_5_, + fp8_emu_scratch_); + } else if (type == f8_type::f8e5m2) { + f8_e5m2_emu_ = std::make_shared(this, + fp8_emu_reserv_1_, + fp8_emu_reserv_2_, + fp8_emu_reserv_3_, + fp8_emu_kmask_aux_, + fp8_emu_scratch_); + } + const bool is_dst_bf16 = std::is_same::value; + if (is_dst_bf16 && mayiuse(cpu_isa_t::avx512_core)) { + uni_vcvtneps2bf16_ = std::make_shared(this, cpu_isa_t::avx512_core); + } + } - template static fn_t get() { if (mayiuse(cpu_isa_t::avx2) && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { - static jit_convert_array converter(convert_vec, sizeof(src_t), sizeof(dst_t)); + static jit_convert_array converter(convert_vec); auto& generator = static_cast(converter); generator.create_kernel(); return (fn_t)generator.jit_ker(); @@ -125,16 +168,192 @@ class jit_convert_array : public jit_kernel { return nullptr; } + std::shared_ptr get_f8_e4m3_emu() const { + return f8_e4m3_emu_; + } + + std::shared_ptr get_f8_e5m2_emu() const { + return f8_e5m2_emu_; + } + + std::shared_ptr get_uni_vcvtneps2bf16() const { + return uni_vcvtneps2bf16_; + } + private: convert_vec_t _convert_vec; size_t _src_size; size_t _dst_size; + + std::shared_ptr f8_e4m3_emu_; + std::shared_ptr f8_e5m2_emu_; + std::shared_ptr uni_vcvtneps2bf16_; + + const Reg64 fp8_emu_scratch_ = rax; + const Zmm fp8_emu_reserv_1_ = Zmm(9); + const Zmm fp8_emu_reserv_2_ = Zmm(10); + const Zmm fp8_emu_reserv_3_ = Zmm(11); + const Zmm fp8_emu_reserv_4_ = Zmm(12); + const Zmm fp8_emu_reserv_5_ = Zmm(13); + const Opmask fp8_emu_kmask_aux_ = Opmask(1); }; +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovups(f32vec, gen.zword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f32_to_f8(f8vec, f32vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + 
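// Editor's note: the fp8 kernels process 16 elements per step (one zmm of 16 f32 lanes
// pairs with one xmm of 16 f8 bytes), hence vlen = 16 and vlen_log2 = 4, while the f16c
// path keeps 8 and 3. The tail handling above shifts by log2 of the element size, which
// simply turns an element count into a byte count:
const size_t tail_elems = 5;
const size_t src_bytes = tail_elems << 2;  // * sizeof(float) == 20
const size_t dst_bytes = tail_elems << 0;  // * sizeof(f8)    == 5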
cvt.get_f8_e4m3_emu()->vcvt_f8_to_f32(f32vec, f8vec); + gen.vmovups(gen.zword[dst], f32vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f16vec, gen.yword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f16_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f16(f16vec, f8vec); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vpmovzxwd(f16vec, gen.yword[src]); + gen.vpslld(f16vec, f16vec, 16); + cvt.get_f8_e4m3_emu()->vcvt_f32_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f32(f32vec, f8vec); + cvt.get_uni_vcvtneps2bf16()->emit_code({static_cast(f32vec.getIdx())}, + {static_cast(f16vec.getIdx())}); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovups(f32vec, gen.zword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f32_to_f8(f8vec, f32vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f32(f32vec, f8vec); + gen.vmovups(gen.zword[dst], f32vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f16vec, gen.yword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f16_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f16(f16vec, f8vec); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vpmovzxwd(f16vec, gen.yword[src]); + gen.vpslld(f16vec, f16vec, 16); + cvt.get_f8_e5m2_emu()->vcvt_f32_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + 
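// Editor's note: the vpmovzxwd + vpslld(16) pair used by the bf16 -> f8 kernels above is
// the standard bf16 -> f32 widening, since bf16 is the upper half of an IEEE f32.
// Scalar equivalent for reference (needs <cstdint> and <cstring>):
static inline float bf16_bits_to_f32(uint16_t bits) {
    uint32_t u = static_cast<uint32_t>(bits) << 16;  // move into the high half
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}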
gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f32(f32vec, f8vec); + cvt.get_uni_vcvtneps2bf16()->emit_code({static_cast(f32vec.getIdx())}, + {static_cast(f16vec.getIdx())}); + gen.vmovdqu(gen.yword[dst], f16vec); +} + template void jit_convert(const TI* arg, TO* out, size_t count) { - using jit_impl = jit_convert_array; - static auto converter = jit_impl::get(); + using jit_impl = jit_convert_array; + static auto converter = jit_impl::get(); if (converter) { typename jit_impl::args_t args = {arg, out, count}; @@ -185,6 +404,12 @@ const std::tuple& Range::fit(const ov::element::Type& prec) { if (prec.is_real()) { double lbound, ubound; switch (prec) { + case ov::element::f8e4m3: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + case ov::element::f8e5m2: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); case ov::element::bf16: lbound = static_cast(std::numeric_limits::lowest()); ubound = static_cast(std::numeric_limits::max()); @@ -293,6 +518,18 @@ struct ConvertPrecision> { src_t lbound, ubound; std::tie(lbound, ubound) = ctx.range(); + // Align with the behavior of ngraph ref and jit implementation. Conversion from f8e4m3-inf + // to float should output float-inf instead of f8e4m3-max. Proper handling of special values + // (nan, inf, overflow) has already been assured by the conversion process. + if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + parallel_for(ctx.size, [&](size_t i) { + dst[i] = static_cast(src[i]); + }); + ctx.converted = true; + return; + } + if (std::is_integral::value || ctx.interimPrc.is_real() || std::is_integral::value) { parallel_for(ctx.size, [&](size_t i) { dst[i] = static_cast(std::max(std::min(src[i], ubound), lbound)); @@ -492,6 +729,12 @@ struct ConvertPrecision> { PrecisionInfo::value_type, \ PrecisionInfo::value_type) +#define INTEL_CPU_CVT_FP8_LIST \ + INTEL_CPU_CVT(f32, f8e4m3), INTEL_CPU_CVT(f16, f8e4m3), INTEL_CPU_CVT(bf16, f8e4m3), INTEL_CPU_CVT(f8e4m3, f32), \ + INTEL_CPU_CVT(f8e4m3, f16), INTEL_CPU_CVT(f8e4m3, bf16), INTEL_CPU_CVT(f32, f8e5m2), \ + INTEL_CPU_CVT(f16, f8e5m2), INTEL_CPU_CVT(bf16, f8e5m2), INTEL_CPU_CVT(f8e5m2, f32), \ + INTEL_CPU_CVT(f8e5m2, f16), INTEL_CPU_CVT(f8e5m2, bf16) + #define INTEL_CPU_CVT_LIST \ INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ @@ -535,7 +778,8 @@ struct ConvertPrecision> { INTEL_CPU_CVT(boolean, f16), INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ - INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) + INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean), \ + INTEL_CPU_CVT_FP8_LIST #define INTEL_CPU_CVT_FROM_BIN_LIST \ INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), INTEL_CPU_CVT(u1, f64), \ @@ -667,6 +911,35 @@ struct ConvertFromByteFPPrecision> { } }; +#if defined(OPENVINO_ARCH_X86_64) +struct ConvertFP8Context { + const void* srcPtr; + void* dstPtr; + size_t size; + bool converted; +}; + +template 
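// Editor's sketch (hypothetical call site, assuming the 5-argument cpu_convert
// overload): with avx512_core_fp16 available, f32 <-> f8 conversions route through the
// JIT kernels above. 448.0f is the largest finite f8e4m3 value, so it survives the
// round trip.
std::vector<float> src = {0.5f, -1.25f, 448.0f};
std::vector<uint8_t> dst(src.size());
cpu_convert(src.data(), dst.data(), ov::element::f32, ov::element::f8e4m3, src.size());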
+struct ConvertFP8Precision; + +template +struct ConvertFP8Precision> { + void operator()(ConvertFP8Context& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); + constexpr size_t batch = 64; + const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); + parallel_for(iterations, [&](size_t i) { + const size_t offset = i * batch; + const size_t current_batch_size = std::min(ctx.size - offset, batch); + jit_convert(src + offset, dst + offset, current_batch_size); + }); + + ctx.converted = true; + } +}; +#endif + void cpu_convert(const void* srcPtr, void* dstPtr, ov::element::Type srcPrc, @@ -728,7 +1001,7 @@ void cpu_convert(const void* srcPtr, OV_SWITCH(intel_cpu, ConvertFrom4BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); - } else if (srcPrc.bitwidth() == 8u && srcPrc.is_real()) { + } else if (srcPrc == ov::element::f8e8m0) { ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false}; OV_SWITCH(intel_cpu, ConvertFromByteFPPrecision, @@ -737,6 +1010,15 @@ void cpu_convert(const void* srcPtr, INTEL_CPU_CVT_FROM_BYTE_FP_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); +#if defined(OPENVINO_ARCH_X86_64) + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16) && + (one_of(srcPrc, ov::element::f8e4m3, ov::element::f8e5m2) || + one_of(dstPrc, ov::element::f8e4m3, ov::element::f8e5m2))) { + ConvertFP8Context ctx{srcPtr, dstPtr, size, false}; + OV_SWITCH(intel_cpu, ConvertFP8Precision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FP8_LIST); + if (!ctx.converted) + OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); +#endif } else { ConvertContext ctx{srcPtr, dstPtr, size, interimPrc, dstPrc, false}; OV_SWITCH(intel_cpu, ConvertPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_LIST); diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 5daefa01eddfab..c2e770db84695b 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -341,8 +341,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener reg_d_bias)); } - if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) - uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); + if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) { + auto const mode = jep_.do_output_saturation ? jit_uni_vcvtneps2bf16::conversion_mode::saturation_mode + : jit_uni_vcvtneps2bf16::conversion_mode::default_mode; + uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa, element::bf16, mode)); + } const auto& jep = jep_; @@ -478,7 +481,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener apply_post_ops(true, jep_.oc_size > 1 ? 
j * sizeof(float) : 0); - store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], xmm_dst, exec_prc, jep.dst_prc); + store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], + xmm_dst, + exec_prc, + jep.dst_prc, + jep.do_output_saturation); } for (size_t i = 0; i < jep.inputs_number; i++) @@ -546,7 +553,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener apply_post_ops(true); - store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc); + store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc, jep.do_output_saturation); for (size_t i = 0; i < jep.inputs_number; i++) if (jep.src_size[i] != 1) @@ -1012,7 +1019,8 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener inline void store_scalar(const Xbyak::Address& op, Xmm xmm_dst, ov::element::Type src_prc, - ov::element::Type dst_prc) { + ov::element::Type dst_prc, + const bool do_output_saturation) { if (src_prc == dst_prc) { switch (src_prc.size()) { case 4: @@ -1047,7 +1055,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vmovss(op, xmm_dst); break; case ov::element::bf16: - uni_vpsrld(xmm_dst, xmm_dst, 16); + if (do_output_saturation) + uni_vpsrld(xmm_dst, xmm_dst, 16); + else + uni_vcvtneps2bf16->emit_code({static_cast<size_t>(xmm_dst.getIdx())}, + {static_cast<size_t>(xmm_dst.getIdx())}); uni_vpextrw(op, xmm_dst, 0x0); break; case ov::element::f16: @@ -1355,6 +1367,7 @@ struct EltwiseKey { ov::element::Type outPrc; dnnl::post_ops postOps; EltwiseImplType implType; + bool doOutputSaturation; size_t hash() const { using namespace dnnl::impl; @@ -1390,6 +1403,10 @@ struct EltwiseKey { seed = hash_combine(seed, outPrc.hash()); seed = get_post_op_hash(seed, *postOps.get()); seed = hash_combine(seed, implType); + + if (outPrc == ov::element::bf16) { + seed = hash_combine(seed, doOutputSaturation); + } return seed; } @@ -1416,6 +1433,8 @@ struct EltwiseKey { result = result && (inpDims[i] == rhs.inpDims[i]); } } + if (doOutputSaturation != rhs.doOutputSaturation) + return false; } return result; @@ -1448,7 +1467,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { const std::vector<ov::element::Type>& inpPrc, const ov::element::Type& outPrc, const dnnl::post_ops& post_ops, - bool useRuntimePtrs) { + bool useRuntimePtrs, + bool doOutputSaturation) { auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) { for (size_t i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { dims[dims.size() - 1] *= dims[i]; @@ -1639,6 +1659,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { jep.dst_prc = outPrc; jep.work_amount = jep.dst_size = jep.dims.back(); jep.oc_size = oc_size; + jep.do_output_saturation = doOutputSaturation; std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), [](size_t& offset) { return offset * sizeof(float); }); @@ -2160,7 +2181,8 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) { key.inpPrc, key.outPrc, key.postOps, - key.implType == EltwiseImplType::optimizedShapeAgnostic); + key.implType == EltwiseImplType::optimizedShapeAgnostic, + key.doOutputSaturation); } bool Eltwise::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept { @@ -2862,6 +2884,18 @@ void Eltwise::prepareParams() { } } + // FP32 constant inputs may contain values outside the BF16 representable range. When the output precision is BF16, we + // choose the "saturation" mode for the fp32->bf16 conversion procedure to prevent -Inf/+Inf values in the + // outputs. Since the "saturation" conversion is more time-consuming, a better solution would be to clamp the constants + // at the compilation stage (ticket: 159589). + key.doOutputSaturation = false; + for (size_t i = 0; i < getParentEdges().size(); i++) { + if (getParentEdgeAt(i)->getParent()->isConstant()) { + key.doOutputSaturation = true; + break; + } + } + auto cache = context->getParamsCache(); auto result = cache->getOrCreate(key, buildExecutor); execPtr = result.first; diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.h b/src/plugins/intel_cpu/src/nodes/eltwise.h index d0ca94e08824c8..8e5fd643665ffd 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.h +++ b/src/plugins/intel_cpu/src/nodes/eltwise.h @@ -43,6 +43,7 @@ struct jit_eltwise_params { size_t work_amount; bool use_runtime_ptrs; + bool do_output_saturation; }; struct jit_eltwise_call_args_indexes { diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 4bb2f714b284fd..34b659a1ef2882 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -11,6 +11,7 @@ #include "openvino/core/shape.hpp" #include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -226,7 +227,8 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op::v0::Constant::get_type_info_static(), op::v0::Result::get_type_info_static(), op::v3::ReadValue::get_type_info_static(), - op::v6::ReadValue::get_type_info_static())) + op::v6::ReadValue::get_type_info_static(), + ov::intel_cpu::ReadValueWithSubgraph::get_type_info_static())) OPENVINO_THROW_NOT_IMPLEMENTED("CPU Input node doesn't support ngraph operation ", op->get_type_name(), " with name ", @@ -479,7 +481,11 @@ void Input::selectOptimalPrimitiveDescriptor() { supportedPrimitiveDescriptors.clear(); // and just use parent memory descriptor for Output node to avoid reorders insertion - NodeConfig config({PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)), BlockedMemoryDesc::FULL_MASK, 0)}, {}); + std::vector<PortConfig> inConfs; + for (size_t i = 0; i < getParentEdges().size(); i++) { + inConfs.push_back({PortConfig(getParentOutputMemDesc(getParentEdgeAt(i)), BlockedMemoryDesc::FULL_MASK, 0)}); + } + NodeConfig config(inConfs, {}); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); selectPrimitiveDescriptorByIndex(0); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp index 1bf64d096e4a84..c4fb7608d521de 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp @@ -57,6 +57,7 @@ struct jit_eltwise_params { size_t work_amount; bool use_runtime_ptrs; + bool do_output_saturation; }; struct jit_eltwise_call_args_indexes { diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index 5a0bd7a1e3dff1..d9c9dba5a1219d 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -10,8 +10,11 @@ #include "dnnl_extension_utils.h" #include "dnnl_types.h" #include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/common/cpu_convert.h" #include "scaled_attn.h" +#include 
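// Editor's sketch (illustrative, not part of the patch): a scalar reference for the
// saturating fp32 -> bf16 conversion selected here. Finite values clamp into the bf16
// range while NaN and +/-Inf pass through, matching the bf16_min/bf16_max/nan/inf table
// entries registered by the emitter. Helper name is illustrative; needs <cmath>,
// <cstring>, <algorithm> and <cstdint>.
static inline uint16_t f32_to_bf16_saturate(float x) {
    if (std::isnan(x)) return 0x7FC0;                             // quiet NaN
    if (std::isinf(x)) return std::signbit(x) ? 0xFF80 : 0x7F80;  // +/-Inf
    const float bf16_max = 3.3895314e38f;                         // bits 0x7F7F0000
    x = std::max(std::min(x, bf16_max), -bf16_max);
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u += 0x7FFF + ((u >> 16) & 1);                                // round to nearest even
    return static_cast<uint16_t>(u >> 16);
}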
"shape_inference/shape_inference_internal_dyn.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" #include "utils/general_utils.h" using namespace dnnl; @@ -373,8 +376,10 @@ bool MemoryInputBase::isSupportedOperation(const std::shared_ptr try { if (!one_of(op->get_type_info(), ov::op::v3::ReadValue::get_type_info_static(), - ov::op::v6::ReadValue::get_type_info_static())) { - errorMessage = "Node is not an instance of ReadValue from the operation set v3 or v6."; + ov::op::v6::ReadValue::get_type_info_static(), + ov::intel_cpu::ReadValueWithSubgraph::get_type_info_static())) { + errorMessage = "Node is not an instance of ReadValue from the operation set v3 " + "or v6, or is not an instance of intel_cpu::ReadValueWithSubgraph"; return false; } } catch (...) { @@ -402,22 +407,26 @@ MemoryInputBase::MemoryInputBase(const std::string id, const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, MemoryInputBase::mode mode) : Input(output_shape, output_prc, name, type, context), MemoryStateNode(id) { outputShapes.emplace_back(output_shape); addOriginalOutputPrecision(output_prc); if (input_shape) { - inputShapes.push_back(*input_shape); - isDynamic = isDynamic || input_shape->isDynamic(); + for (auto inp_shape : *input_shape) { + inputShapes.push_back(inp_shape); + isDynamic = isDynamic || inp_shape.isDynamic(); + } if (isDynamic && !shapeInference) { shapeInference = PassThroughShapeInferFactory().makeShapeInfer(); } } if (input_prc) { - addOriginalInputPrecision(*input_prc); + for (auto inp_prc : *input_prc) { + addOriginalInputPrecision(inp_prc); + } } if (created()) { context->getMemoryStatesRegister()->registerInput(this); @@ -456,8 +465,11 @@ void MemoryInputBase::initSupportedPrimitiveDescriptors() { NodeConfig config; if (!getParentEdges().empty()) { - const auto& inputShape = getInputShapeAtPort(0); - config.inConfs.emplace_back(descCreators.at(LayoutType::ncsp)->createSharedDesc(precision, inputShape)); + for (size_t i = 0; i < getParentEdges().size(); i++) { + const auto& inputShape = getInputShapeAtPort(i); + auto inp_prc = getOriginalInputPrecisionAtPort(i); + config.inConfs.emplace_back(descCreators.at(LayoutType::ncsp)->createSharedDesc(inp_prc, inputShape)); + } } const auto& outputShape = getOutputShapeAtPort(0); @@ -562,6 +574,47 @@ void MemoryInputBase::bypassAssignState() { return; } +MemoryInput::MemoryInput(const std::shared_ptr& op, const GraphContext::CPtr ctx) + : MemoryInputBase::MemoryInputBase(op, ctx) { + auto rvWithSubgraph = ov::as_type_ptr(op); + if (rvWithSubgraph) { + body = rvWithSubgraph->get_function(); + subGraph = make_unique(); + if (isDynamic) { + shapeInference = InternalDynShapeInferFactory().makeShapeInfer(); + } + } +} + +MemoryInput::MemoryInput(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func, + mode mode) + : MemoryInputBase::MemoryInputBase(id, + name, + type, + output_shape, + output_prc, + context, + input_shape, + input_prc, + mode) { + body = func; + + if (haveSubgraph()) { + subGraph = make_unique(); + if (isDynamic) { + shapeInference = 
InternalDynShapeInferFactory().makeShapeInfer();
+        }
+    }
+}
+
 bool MemoryInput::needInitGraphProcessing() const {
     return !getParentEdges().empty() && getAssignedState()->is_reset_state();
 }
@@ -620,6 +673,59 @@ void MemoryInput::initOptimalPrimitiveDescriptor() {
     config.outConfs.front().setMemDesc(mem_desc);
     // bypass any checks, we enforce the child descriptor
     selectedPd->setConfig(config);
+
+    if (haveSubgraph()) {
+        // Adopt the parent configuration to avoid inserting a reorder before the MemoryInput.
+        std::vector<node::Input::InputConfig> graphInputConfig;
+
+        for (size_t i = 0; i < getParentEdges().size(); i++) {
+            auto desc = getParentOutputMemDesc(getParentEdgeAt(i));
+            graphInputConfig.emplace_back(node::Input::InputConfig{desc, true});
+        }
+
+        std::vector<node::Input::OutputConfig> graphOutputConfig;
+        for (auto&& portConfig : config.outConfs) {
+            auto desc = portConfig.getMemDesc();
+            graphOutputConfig.emplace_back(node::Input::OutputConfig{desc, true});
+        }
+
+        // configure the inner graph to get the information about output memory descriptors
+        subGraph->Init(body, context, graphInputConfig, graphOutputConfig);
+    }
+}
+
+// @todo add an ASCII diagram for memory mapping / reuse
+void MemoryInput::createPrimitive() {
+    MemoryInputBase::createPrimitive();
+    if (haveSubgraph()) {
+        OPENVINO_ASSERT(getOriginalInputsNumber() == subGraph->inputsNumber(),
+                        "Number of node inputs must be equal to the number of inner graph's inputs: ",
+                        getOriginalInputsNumber(),
+                        " != ",
+                        subGraph->inputsNumber());
+
+        std::vector<MemoryPtr> inputMemory;
+        for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
+            auto srcEdgeMem = getSrcMemoryAtPort(i);
+            // create separate input memory objects instead of sharing them, to avoid data corruption
+            auto mem = std::make_shared<Memory>(getEngine(), srcEdgeMem->getDescPtr(), srcEdgeMem->getMemoryBlock());
+            subgraphMemoryPtrs.push_back(mem);
+            inputMemory.emplace_back(std::move(mem));
+        }
+
+        OPENVINO_ASSERT(getOriginalOutputsNumber() == subGraph->outputsNumber(),
+                        "Number of node outputs must be equal to the number of inner graph's outputs: ",
+                        getOriginalOutputsNumber(),
+                        " != ",
+                        subGraph->outputsNumber());
+
+        std::vector<MemoryPtr> outputMemory;
+        for (size_t i = 0; i < getOriginalOutputsNumber(); i++) {
+            outputMemory.emplace_back(getDstMemoryAtPort(i));
+        }
+
+        subGraph->Activate(inputMemory, outputMemory);
+    }
 }
 
 void MemoryInput::runDynamic(dnnl::stream strm) {
@@ -655,13 +761,43 @@ void MemoryInput::runDynamic(dnnl::stream strm) {
         memBlock->reset();
     }
 
-    // reshape output
-    const auto& newDims = processInitGraph ? getSrcMemoryAtPort(0)->getStaticDims() : stateDims;
+    MemoryPtr src = assignedMem;  // declare src memory
+    if (processInitGraph) {
+        if (haveSubgraph()) {
+            // put the prepareParams logic into runDynamic, because the init graph is not called each time.
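+            // The per-input memory objects created in createPrimitive() share their memory blocks with the
+            // parent edges, so only their descriptors have to be re-synchronized with the current input
+            // shapes before running the inner graph.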
+            for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
+                // since the external and internal descriptors are compatible, we may pass the descriptor
+                subgraphMemoryPtrs[i]->redefineDesc(getSrcMemoryAtPort(i)->getDescPtr());
+            }
+
+            subGraph->ResetInferCount();
+            subGraph->Infer();
+            // depending on the memory sharing solution, we can return here if the memory is substituted from the
+            // external graph or override the src pointer with the memory pointer pointing to the subgraph output
+            // memory
+            OPENVINO_ASSERT(subGraph->outputsNumber() == 1);
+            src = subGraph->getOutputNodeByIndex(0)->getSrcMemoryAtPort(0);
+
+            // since the shape inference (InternalDynShapeInfer does nothing) is performed, the memory of the extra
+            // child edges attached to the output ports has to be updated after the inference of the inner graph
+            // finishes
+            auto& childEdges = getChildEdges();
+            for (size_t j = 1; j < childEdges.size(); j++) {
+                auto& childEdge = childEdges[j];
+                auto childEdgePtr = childEdge.lock();
+                assert(childEdgePtr);
+                assert(0 == childEdgePtr->getInputNum());
+                childEdgePtr->getMemoryPtr()->redefineDesc(src->getDescPtr());
+            }
+        } else {
+            src = getSrcMemoryAtPort(0);
+        }
+    }
+    // reshape output
+    const auto& newDims = src->getStaticDims();
     redefineOutputMemory(0, newDims);
 
     // copy data when necessary
-    auto src = processInitGraph ? getSrcMemoryAtPort(0) : assignedMem;
     if (src->getData() != dst->getData()) {
         dst->load(*src);
     }
@@ -692,10 +828,21 @@ void MemoryInput::runStatic(dnnl::stream strm) {
         memBlock->reset();
     }
 
-    const auto processInitGraph = needInitGraphProcessing();
+    const bool processInitGraph = needInitGraphProcessing();
+    MemoryPtr src = assignedMem;  // declare src memory
+    if (processInitGraph) {
+        if (haveSubgraph()) {
+            subGraph->ResetInferCount();
+            subGraph->Infer();
+
+            OPENVINO_ASSERT(subGraph->outputsNumber() == 1);
+            src = subGraph->getOutputNodeByIndex(0)->getSrcMemoryAtPort(0);
+        } else {
+            src = getSrcMemoryAtPort(0);
+        }
+    }
 
     // copy data when necessary
-    auto src = processInitGraph ?
getSrcMemoryAtPort(0) : assignedMem; auto dst = getDstMemoryAtPort(0); if (src->getData() != dst->getData()) { dst->load(*src); @@ -749,6 +896,10 @@ MemStatePtr MemoryInput::makeState() const { original_desc); } +std::shared_ptr MemoryInput::getSubGraph() { + return body; +} + bool MemoryInput::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { return MemoryInputBase::isSupportedOperation(op, errorMessage); } @@ -759,8 +910,8 @@ MemoryInputSDPA::MemoryInputSDPA(const std::string id, const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, const std::shared_ptr& sdpaNode) : MemoryInputBase(id, name, type, output_shape, output_prc, context, input_shape, input_prc), m_sdpaNode(sdpaNode) {} @@ -865,8 +1016,9 @@ MemoryInputSingle::MemoryInputSingle(const std::string id, const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc) + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func) : MemoryInput(id, name, type, @@ -875,6 +1027,7 @@ MemoryInputSingle::MemoryInputSingle(const std::string id, context, input_shape, input_prc, + func, MemoryInputBase::mode::single_read_value) {} MemStatePtr MemoryInputSingle::makeState() const { diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index 9c0c9664ce8a27..1d40849b0f3356 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -4,6 +4,8 @@ #pragma once +#include + #include #include "input.h" @@ -162,8 +164,8 @@ class MemoryInputBase : public Input, public MemoryStateNode { const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, mode mode = mode::read_value_assign); protected: @@ -192,15 +194,30 @@ class MemoryInputBase : public Input, public MemoryStateNode { class MemoryInput : public MemoryInputBase { public: - using MemoryInputBase::MemoryInputBase; + MemoryInput(const std::shared_ptr& op, const GraphContext::CPtr ctx); + MemoryInput(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func = nullptr, + mode mode = mode::read_value_assign); + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void initOptimalPrimitiveDescriptor() override; void resolveInPlaceEdges(Edge::LOOK look) override; + void createPrimitive() override; + MemStatePtr makeState() const override; + std::shared_ptr getSubGraph(); + protected: bool needInitGraphProcessing() const; void runStatic(dnnl::stream strm) override; @@ -210,7 +227,15 @@ class MemoryInput : public MemoryInputBase { void assignStateHook() override { /*pass*/ } + bool haveSubgraph() const { + return body != nullptr; + } + private: + std::shared_ptr body = nullptr; + std::unique_ptr subGraph = nullptr; + std::vector subgraphMemoryPtrs; + ProxyMemoryBlockPtr memBlock = nullptr; }; @@ -222,8 +247,9 @@ class MemoryInputSingle : 
public MemoryInput { const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc); + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -242,8 +268,8 @@ class MemoryInputSDPA : public MemoryInputBase { const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, const std::shared_ptr& sdpaNode); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp b/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp index 58d855a091d716..19d4863c3afbcb 100644 --- a/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp @@ -11,10 +11,8 @@ using namespace ov::intel_cpu::node; MemoryNode::MemoryNode(const std::shared_ptr& op) { - if (auto assignOp = ov::as_type_ptr(op)) { + if (auto assignOp = std::dynamic_pointer_cast(op)) { m_id = assignOp->get_variable_id(); - } else if (auto readValueOp = ov::as_type_ptr(op)) { - m_id = readValueOp->get_variable_id(); } else { OPENVINO_THROW("Unexpected ov::Node type: ", op->get_type_info().name, " in MemoryNode"); } diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index db55c728df725e..b3c2aa0b298a5a 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -218,6 +218,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< ov::element::Type_t::i4, ov::element::Type_t::u8, ov::element::Type_t::i8, + ov::element::Type_t::f8e4m3, + ov::element::Type_t::f8e5m2, ov::element::Type_t::u16, ov::element::Type_t::i16, ov::element::Type_t::u32, diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.cpp new file mode 100644 index 00000000000000..39df4b6a29c099 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "read_value_with_subgraph.hpp" + +#include "itt.hpp" +#include "transformations/itt.hpp" + +ov::intel_cpu::ReadValueWithSubgraph::ReadValueWithSubgraph(const std::shared_ptr& variable, + std::shared_ptr body) { + m_variable = variable; + set_function(body); +} + +ov::intel_cpu::ReadValueWithSubgraph::ReadValueWithSubgraph(const std::shared_ptr& variable, + std::shared_ptr body, + const OutputVector& args) + : ReadValueWithSubgraph(variable, body) { + set_arguments(args); +} + +std::string ov::intel_cpu::ReadValueWithSubgraph::get_variable_id() const { + OPENVINO_ASSERT(m_variable, "Variable is not initialized. Variable_id is unavailable"); + return get_variable()->get_info().variable_id; +} + +void ov::intel_cpu::ReadValueWithSubgraph::set_input(const Output& value, + const std::shared_ptr& body_parameter) { + OPENVINO_ASSERT(body_parameter != nullptr, "Missing parameter! 
parameter is nullptr!");
+    auto param_index = m_bodies[0]->get_parameter_index(body_parameter);
+
+    OPENVINO_ASSERT(param_index != -1, "Missing parameter ", body_parameter->get_friendly_name(), " for \'body\'!");
+
+    set_invariant_inputs(value, {body_parameter});
+}
+
+ov::Output<ov::Node> ov::intel_cpu::ReadValueWithSubgraph::set_output(
+    const std::shared_ptr<ov::op::v0::Result>& body_result) {
+    OPENVINO_ASSERT(body_result != nullptr, "Incorrect result in \"body\"! Result can't be \'nullptr\'");
+    auto result_id = m_bodies[0]->get_result_index(body_result);
+
+    OPENVINO_ASSERT(result_id != -1, "Missing result ", body_result->get_friendly_name(), " in \'body\'!");
+
+    return set_body_outputs({body_result});
+}
+
+std::shared_ptr<ov::Node> ov::intel_cpu::ReadValueWithSubgraph::clone_with_new_inputs(
+    const OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(intel_cpu_ReadValueWithSubgraphNode_clone_with_new_inputs);
+
+    check_new_args_count(this, new_args);
+    auto op =
+        std::make_shared<ReadValueWithSubgraph>(this->get_variable(), get_function()->clone(), new_args);
+    OPENVINO_ASSERT(op != nullptr,
+                    "Cannot clone ",
+                    description(),
+                    " operation with name ",
+                    get_friendly_name());
+    op->set_output_size(m_output_descriptions[0].size());
+    for (const auto& m_input_descr : m_input_descriptions[0]) {
+        op->m_input_descriptions[0].push_back(m_input_descr->copy());
+    }
+    for (const auto& m_output_descr : m_output_descriptions[0]) {
+        op->m_output_descriptions[0].push_back(m_output_descr->copy());
+    }
+    op->validate_and_infer_types();
+    return op;
+}
+
+bool ov::intel_cpu::ReadValueWithSubgraph::visit_attributes(AttributeVisitor& visitor) {
+    INTERNAL_OP_SCOPE(intel_cpu_ReadValueWithSubgraphNode_visit_attributes);
+    visitor.on_attribute("variable_id", m_variable);
+
+    auto variable_info = m_variable->get_info();
+    visitor.on_attribute("variable_type", variable_info.data_type);
+    visitor.on_attribute("variable_shape", variable_info.data_shape);
+    m_variable->update(variable_info);
+
+    visitor.on_attribute("body", m_bodies[0]);
+    visitor.on_attribute("inputs", m_input_descriptions[0]);
+    visitor.on_attribute("outputs", m_output_descriptions[0]);
+    return true;
+}
+
+void ov::intel_cpu::ReadValueWithSubgraph::validate_and_infer_types() {
+    INTERNAL_OP_SCOPE(intel_cpu_ReadValueWithSubgraphNode_validate_and_infer_types);
+
+    NODE_VALIDATION_CHECK(this,
+                          m_bodies.size() == 1,
+                          "ReadValueWithSubgraph contains incorrect number of bodies:",
+                          m_bodies.size());
+
+    validate_and_infer_type_body(get_function(), m_input_descriptions[0]);
+
+    auto output_nodes = outputs();
+
+    auto outputs_map = get_mapping_outputs_on_body_description(m_output_descriptions[0]);
+
+    // Checking each output
+    for (size_t output_index = 0; output_index < output_nodes.size(); ++output_index) {
+        NODE_VALIDATION_CHECK(this,
+                              outputs_map.count(output_index) != 0,
+                              "Incorrect association in body! Output ",
Output ", + output_index, + " is not associated with results in then_body!"); + + auto desc = outputs_map.at(output_index); + + auto node_result = m_bodies[0]->get_results().at(desc->m_body_value_index)->input_value(0); + + set_output_type(output_index, node_result.get_element_type(), node_result.get_partial_shape()); + } +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.hpp new file mode 100644 index 00000000000000..037f8eb302afcd --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "openvino/op/read_value.hpp" +#include "openvino/op/util/sub_graph_base.hpp" +#include "transformations/cpu_opset/common/op/submodel.hpp" + +namespace ov { +namespace intel_cpu { + +class ReadValueWithSubgraph : public ov::op::util::SubGraphOp, public ov::op::util::VariableExtension { +public: + OPENVINO_OP("ReadValueWithSubgraph", "cpu_plugin_opset"); + + ReadValueWithSubgraph() = default; + ReadValueWithSubgraph(const std::shared_ptr& variable, std::shared_ptr body); + ReadValueWithSubgraph(const std::shared_ptr& variable, + std::shared_ptr body, + const OutputVector& args); + + std::string get_variable_id() const override; + + void set_input(const Output& value, const std::shared_ptr& body_parameter); + + Output set_output(const std::shared_ptr& body_result); + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.cpp new file mode 100644 index 00000000000000..e2b283e65c8615 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "move_readvalue_inputs_to_subgraph.hpp" + +#include + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/constant_folding.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/rotary_positional_embeddings.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" +#include "transformations/cpu_opset/common/op/sdpa.hpp" +#include "transformations/cpu_opset/common/op/submodel.hpp" +#include "transformations/rt_info/disable_fp16_compression.hpp" +#include "transformations/utils/gen_pattern.hpp" +#include "transformations/utils/utils.hpp" + +ov::intel_cpu::MoveReadValueInputsToSubgraph::MoveReadValueInputsToSubgraph() { + MATCHER_SCOPE(MoveReadValueInputsToSubgraph); + using namespace ov::pass::pattern; + + auto readvalue_pattern = pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto readvalue = as_type_ptr(pattern_map.at(readvalue_pattern).get_node_shared_ptr()); + if (!readvalue || readvalue->get_input_size() != 1u) { + return false; + } + + if 
(readvalue->get_rt_info().count("DisableInitSubgraphFusing") &&
+            readvalue->get_rt_info()["DisableInitSubgraphFusing"].as<bool>()) {
+            return false;
+        }
+
+        NodeVector subgraph_nodes;
+        std::unordered_set<std::shared_ptr<ov::Node>> visited_path_to_output;  // Cache nodes which connect to Output.
+        std::unordered_set<std::shared_ptr<ov::Node>> visited_path_to_rv;  // Cache nodes which connect to ReadValue.
+        NodeVector inputs = {};
+        OutputVector outputs = {};
+
+        // DFS: check whether the current node's final successors lead only to ReadValue.
+        std::function<void(std::shared_ptr<ov::Node>, bool&)> dfs = [&](std::shared_ptr<ov::Node> node,
+                                                                        bool& found_output) {
+            if (found_output) {
+                return;
+            }
+
+            if (visited_path_to_output.find(node) != visited_path_to_output.end()) {
+                found_output = true;
+                return;
+            }
+
+            if (visited_path_to_rv.find(node) != visited_path_to_rv.end()) {
+                return;
+            }
+
+            // node has no consumers, i.e. it is an Output
+            if (node->get_output_target_inputs(0).size() == 0u) {
+                found_output = true;
+                return;
+            }
+
+            bool any_child_on_output_path = false;
+            for (const auto& child : node->get_output_target_inputs(0)) {
+                auto son = child.get_node()->shared_from_this();
+                if (son == readvalue) {
+                    continue;
+                }
+
+                bool new_found_output = false;
+                dfs(son, new_found_output);
+                if (new_found_output) {
+                    any_child_on_output_path = true;
+                }
+            }
+
+            if (any_child_on_output_path) {
+                visited_path_to_output.insert(node);
+                found_output = any_child_on_output_path;
+            }
+        };
+
+        std::function<void(std::shared_ptr<ov::Node>)> reverse_dfs = [&](std::shared_ptr<ov::Node> node) {
+            if (visited_path_to_output.find(node) != visited_path_to_output.end()) {
+                inputs.emplace_back(node);
+                return;
+            }
+
+            if (visited_path_to_rv.find(node) != visited_path_to_rv.end()) {
+                return;
+            }
+
+            if (ov::op::util::is_parameter(node)) {
+                inputs.emplace_back(node);
+                return;
+            }
+
+            // Check if the current node has a path (bypassing the ReadValue node) to the Output node via the DFS
+            // algorithm.
+            bool found_output = false;  // Flag: an Output node was found
+            dfs(node, found_output);
+
+            if (found_output) {
+                inputs.emplace_back(node);
+                visited_path_to_output.insert(node);
+                return;
+            }
+
+            visited_path_to_rv.insert(node);
+
+            // Cache to subgraph_nodes
+            subgraph_nodes.emplace_back(node);
+
+            for (size_t i = 0; i < node->get_input_size(); i++) {
+                reverse_dfs(node->get_input_node_shared_ptr(i));
+            }
+        };
+
+        // Reverse DFS from ReadValue's input: find all suitable nodes and move them to subgraph_nodes.
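+        // Invariant: a node may be moved into the subgraph only if every path from it (bypassing ReadValue
+        // itself) terminates at this ReadValue; any traversed node that also reaches a model Output is turned
+        // into a subgraph input instead, so the main graph can still compute it.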
+        reverse_dfs(readvalue->get_input_node_shared_ptr(0));
+
+        if (inputs.size() == 0 || subgraph_nodes.size() == 0) {
+            return false;
+        }
+
+        // Subgraph's input
+        auto params = ParameterVector{};
+        for (auto inp : inputs) {
+            auto param =
+                std::make_shared<ov::op::v0::Parameter>(inp->get_element_type(), inp->get_output_partial_shape(0));
+            params.push_back(param);
+            for (const auto& child : inp->get_output_target_inputs(0)) {
+                auto it = std::find(subgraph_nodes.begin(), subgraph_nodes.end(), child.get_node()->shared_from_this());
+                if (it != subgraph_nodes.end()) {
+                    child.replace_source_output(param);
+                }
+            }
+        }
+
+        // Subgraph's output
+        auto last_node = readvalue->get_input_node_shared_ptr(0);
+        auto output = std::make_shared<ov::op::v0::Result>(last_node);
+        auto func = std::make_shared<ov::Model>(ov::ResultVector({output}), params, "state_init_submodel");
+
+        auto new_rv = std::make_shared<ov::intel_cpu::ReadValueWithSubgraph>(readvalue->get_variable(), func);
+
+        for (size_t i = 0; i < inputs.size(); i++) {
+            new_rv->set_input(inputs[i]->output(0), params[i]);
+        }
+        new_rv->set_output(output);
+
+        // Replace ReadValue with ov::intel_cpu::ReadValueWithSubgraph
+        ov::replace_node(readvalue, new_rv);
+        ov::copy_runtime_info(subgraph_nodes, new_rv);
+        new_rv->validate_and_infer_types();
+        return true;
+    };
+
+    auto m = std::make_shared<Matcher>(readvalue_pattern, matcher_name);
+    this->register_matcher(m, callback);
+}
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.hpp
new file mode 100644
index 00000000000000..220003cc83ead1
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/graph_rewrite.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @brief Move ReadValue's inputs inside the new CPU ngraph node: ReadValueWithSubgraph op.
+ *
+ *          input1
+ *             |
+ *   Some nodes (they have only one common successor [ReadValue])              input1
+ *             |                                                                  |
+ *        ReadValue                        ------->             ReadValueWithSubgraph (Subgraph is inside)
+ *        |       \                                                      |       \
+ *     Assign    others                                               Assign    others
+ */
+
+class MoveReadValueInputsToSubgraph : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MoveReadValueInputsToSubgraph", "0");
+    MoveReadValueInputsToSubgraph();
+};
+
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index 0ec2049d1ccc1c..447adb0b2fe23f 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -270,6 +270,13 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
         else
             assign_v_node->set_arguments({new_node->output(2)});
 
+        // Markup pattern:
+        // ReadValue->Convert(Optional)->ScaledDotProductAttentionWithKVCache->Convert(Optional)->Assign, so that
+        // ReadValue can't be replaced with ReadValueWithSubgraph in this pattern.
+        // TODO: Temporarily skip this pattern. If MemoryInputSDPA supports Subgraph in the future, it may be deleted.
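+        // The "DisableInitSubgraphFusing" rt_info entries set below are checked by MoveReadValueInputsToSubgraph,
+        // whose matcher callback returns early when the flag is present on a ReadValue node.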
+ past_k_node->get_rt_info()["DisableInitSubgraphFusing"] = true; + past_v_node->get_rt_info()["DisableInitSubgraphFusing"] = true; + return true; }; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 614f7d690f8726..5142ee319ac523 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -10,6 +10,7 @@ #include "common/pass/convert_to_swish_cpu.hpp" #include "common/pass/fc_bias_fusion.hpp" #include "common/pass/move_fc_reshape_to_weights.hpp" +#include "common/pass/move_readvalue_inputs_to_subgraph.hpp" #include "common/pass/rnn_sequences_optimization.hpp" #include "config.h" #include "itt.hpp" @@ -70,6 +71,7 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr& model, const C false); CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, ov::pass::EliminateConvert); // Need to clean up after the ConvertPrecision. + CPU_REGISTER_PASS_COMMON(manager, MoveReadValueInputsToSubgraph); manager.run_passes(model); } diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index a63377312ecb95..fb9e0925bc89e2 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -37,6 +37,7 @@ #include "transformations/common_optimizations/nop_elimination.hpp" #include "transformations/common_optimizations/reshape_prelu.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" +#include "transformations/common_optimizations/sdpa_fusion.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" #include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp" #include "transformations/common_optimizations/wrap_interpolate_into_transposes.hpp" @@ -695,6 +696,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_DISABLE_PASS_COMMON(manager, ov::pass::MatMulConstTransposesExtraction); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertScatterNDUpdate15ToScatterNDUpdate3); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertSliceScatter); + CPU_DISABLE_PASS_COMMON(manager, ov::pass::SDPAFusion); CPU_DISABLE_PASS_X64(manager, ov::pass::HSigmoidDecomposition); CPU_DISABLE_PASS_X64(manager, ov::pass::ReduceL1Decomposition); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp index 4989fb3a0f04b7..a3c1f9ef7d3544 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp @@ -16,11 +16,45 @@ using namespace CPUTestUtils; namespace ov { namespace test { +static std::string special_value_to_string(const ov::test::SpecialValue& value) { + if (value == SpecialValue::none) { + return "none"; + } else if (value == SpecialValue::nan) { + return "nan"; + } else if (value == SpecialValue::inf) { + return "inf"; + } else if (value == SpecialValue::overflow) { + return "overflow"; + } + return "unknown"; +} + +template +static T set_special_value(T& value, const ov::test::SpecialValue& 
special_value) { + if (special_value == ov::test::SpecialValue::nan) { + value = NAN; + } else if (special_value == ov::test::SpecialValue::inf) { + value = INFINITY; + } else if (special_value == ov::test::SpecialValue::overflow) { + value = value + std::numeric_limits::max(); + } + return value; +} + +template +static void modify_value(ov::Tensor& tensor, const ov::test::SpecialValue& special_value) { + T* dataPtr = static_cast(tensor.data()); + for (size_t i = 0; i < tensor.get_size(); i++) { + set_special_value(dataPtr[i], special_value); + } +} + std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo obj) { InputShape inputShape; ov::element::Type inPrc, outPrc; + ov::test::SpecialValue special_value; CPUSpecificParams cpuParams; - std::tie(inputShape, inPrc, outPrc, cpuParams) = obj.param; + std::tie(inputShape, inPrc, outPrc, special_value, cpuParams) = obj.param; std::ostringstream result; @@ -30,6 +64,7 @@ std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo(inPrc, shape)); @@ -101,6 +146,31 @@ void ConvertCPULayerTest::SetUp() { function = makeNgraphFunction(inPrc, params, conversion, "ConversionCPU"); } +void ConvertCPULayerTest::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& funcInputs = function->inputs(); + for (size_t i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::Tensor tensor = + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]); + if (special_value != ov::test::SpecialValue::none) { + if (inPrc == ov::element::f32) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f16) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::bf16) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f8e4m3) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f8e5m2) { + modify_value(tensor, special_value); + } + } + + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } +} + void ConvertCPULayerTest::validate_out_prc() const { if (outPrc == ov::element::boolean) FAIL() << "ConvertCPULayerTest supports only non boolean output prc"; diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp index a53f56f873151c..a4f4e0fc56c238 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp @@ -13,9 +13,12 @@ using namespace CPUTestUtils; namespace ov { namespace test { +enum SpecialValue { none, nan, inf, overflow }; + using convertLayerTestParamsSet = std::tuple; class ConvertCPULayerTest : public testing::WithParamInterface, @@ -25,9 +28,12 @@ class ConvertCPULayerTest : public testing::WithParamInterface& targetInputStaticShapes) override; virtual void validate_out_prc() const; ov::element::Type inPrc, outPrc; +private: + ov::test::SpecialValue special_value; }; class ConvertToBooleanCPULayerTest : public ConvertCPULayerTest { diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp index 11e0440b2e3618..e5d87f5cb2f3dd 100644 --- 
a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp @@ -16,6 +16,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_7D_Dynamic, ConvertCPULayerTe ::testing::ValuesIn(inShapes_7D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({}, {}, {}, {}))), ConvertCPULayerTest::getTestCaseName); @@ -24,6 +25,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_7D_Static, ConvertCPULayerTes ::testing::ValuesIn(inShapes_7D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({}, {}, {}, {}))), ConvertCPULayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp index 59ca1065bf78d9..8181304bf95e7d 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp @@ -31,6 +31,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4D_Dynamic, ConvertCPULayerTe ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -39,6 +40,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4bit_Dynamic, ConvertCPULayer ::testing::Combine(::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn({ov::element::u4, ov::element::i4}), ::testing::ValuesIn({ov::element::f32, ov::element::bf16, ov::element::u8, ov::element::i8}), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {"ref"}))), ConvertCPULayerTest::getTestCaseName); @@ -52,9 +54,69 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4D_Static, ConvertCPULayerTes ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_static_common)), ConvertCPULayerTest::getTestCaseName); +const std::vector float_precisions = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +const std::vector f8_precisions = { + ov::element::f8e4m3, + ov::element::f8e5m2, +}; + +const std::vector specialValue = { + ov::test::SpecialValue::none, + ov::test::SpecialValue::nan, + ov::test::SpecialValue::inf, + ov::test::SpecialValue::overflow, +}; + +std::vector memForm4D_fp8 = { + CPUSpecificParams({nchw}, {nchw}, {}, expectedPrimitiveType()), + CPUSpecificParams({nhwc}, {nhwc}, {}, expectedPrimitiveType()), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_fp8_Static, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_to_fp8_Static, ConvertCPULayerTest, + ::testing::Combine( + 
::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_fp8_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_to_fp8_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + } // namespace Conversion } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp index 9c34d6220d4b2d..ab1e06639c5a3e 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp @@ -23,6 +23,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_blocked_Dynamic, ConvertCPULa ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -44,6 +45,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_Blocked, ConvertCPULayerTest, ::testing::ValuesIn(inShapes_4D_blocked), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(filterCPUSpecificParams(memForm4D_static_blocked))), ConvertCPULayerTest::getTestCaseName); @@ -52,6 +54,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Static, ConvertToBoolean ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions_floating_point), ::testing::Values(ov::element::boolean), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertToBooleanCPULayerTest::getTestCaseName); @@ -60,6 +63,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Dynamic, ConvertToBoolea ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions_floating_point), ::testing::Values(ov::element::boolean), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertToBooleanCPULayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp new file mode 100644 index 00000000000000..9186b43d3d863e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp @@ -0,0 +1,314 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/ov_tensor_utils.hpp" +#include 
"utils/cpu_test_utils.hpp" + +using namespace ov::test; +using namespace CPUTestUtils; +using InitGraphStatefulModelTestParams = std::tuple, // input shapes + bool // ReadValue Assgin Direct pair or not + >; +class InitGraphStatefulModelBase : virtual public ov::test::SubgraphBaseTest, + public testing::WithParamInterface, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + std::ostringstream result; + + std::vector inputShapes; + bool directPair; + std::tie(inputShapes, directPair) = obj.param; + + result << "IS="; + for (const auto& shape : inputShapes) { + result << ov::test::utils::partialShape2str({shape.first}) << "_"; + } + result << "TS="; + for (const auto& shape : inputShapes) { + result << "("; + if (!shape.second.empty()) { + for (const auto& itr : shape.second) { + result << ov::test::utils::vec2str(itr); + } + } + result << ")"; + } + result << "_DirectAssign=" << ov::test::utils::bool2str(directPair); + result << ")"; + + return result.str(); + } + + std::vector calculate_refs() override { + for (const auto& param : functionRefs->get_parameters()) { + inferRequestRef.set_tensor(param->get_default_output(), inputs.at(matched_parameters[param])); + } + inferRequestRef.infer(); + + auto outputs = std::vector{}; + for (const auto& output : functionRefs->outputs()) { + outputs.push_back(inferRequestRef.get_tensor(output)); + } + + return outputs; + } + + std::vector get_plugin_outputs() override { + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + inferRequest.infer(); + auto outputs = std::vector{}; + for (const auto& output : function->outputs()) { + outputs.push_back(inferRequest.get_tensor(output)); + } + return outputs; + } + + void run() override { + prepare(); + + auto&& states = inferRequest.query_state(); + auto&& refStates = inferRequestRef.query_state(); + + for (size_t i = 0; i < targetStaticShapes.size(); i++) { + for (auto iters = 0; iters < 5; iters++) { + generate_inputs(targetStaticShapes[i]); + + if (iters & 0x1) { + states.front().reset(); + refStates.front().reset(); + } else { + // generate and set state tensors every even iteration + using ov::test::utils::InputGenerateData; + + auto stateShape = get_state_shape(i); + auto tensor = utils::create_and_fill_tensor(statePrc, + stateShape, + InputGenerateData{0, 1, 1, iters}); + states.front().set_state(tensor); + refStates.front().set_state(tensor); + } + + validate(); + } + } + } + +protected: + virtual void check_init_graph_node() = 0; + + virtual ov::Shape get_state_shape(size_t i) = 0; + + void prepare() { + compile_model(); + + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + check_init_graph_node(); + + // ref + functionRefs = function->clone(); + + matched_parameters.clear(); + const auto& ref_params = functionRefs->get_parameters(); + const auto& params = function->get_parameters(); + for (size_t in_idx = 0; in_idx < params.size(); ++in_idx) { + matched_parameters.insert({ref_params[in_idx], params[in_idx]}); + } + + auto compiledModelRef = core->compile_model(functionRefs, ov::test::utils::DEVICE_TEMPLATE); + inferRequestRef = compiledModelRef.create_infer_request(); + } + + std::vector inputShapes; + const ov::element::Type netPrc = ElementType::f32; + ov::InferRequest inferRequestRef; + ov::element::Type statePrc; +}; + +// ReadValue Assign direct pair +// +// input_1 input_2 +// | | +// Add_1 / +// \ / +// MatMul +// | +// input_0 ReadValue .......... +// \ / \ . 
+//      Add_0       Assign ........
+//        |
+//      Result
+
+class InitGraphStatefulModel : public InitGraphStatefulModelBase {
+public:
+    void SetUp() override {
+        targetDevice = utils::DEVICE_CPU;
+
+        bool directPair;
+        std::tie(inputShapes, directPair) = this->GetParam();
+
+        init_input_shapes(inputShapes);
+        ov::ParameterVector input_params;
+        for (auto&& shape : inputDynamicShapes) {
+            input_params.push_back(std::make_shared<ov::op::v0::Parameter>(netPrc, shape));
+        }
+
+        input_params[0]->set_friendly_name("input_0");
+        input_params[1]->set_friendly_name("input_1");
+        input_params[2]->set_friendly_name("input_2");
+
+        // init_graph
+        auto add_1 =
+            std::make_shared<ov::op::v1::Add>(input_params[1], ov::op::v0::Constant::create(netPrc, {1}, {1.0f}));
+        add_1->set_friendly_name("init_graph/add_1");
+        auto mm_0 = std::make_shared<ov::op::v0::MatMul>(add_1, input_params[2]);
+        mm_0->set_friendly_name("init_graph/mm_0");
+
+        const std::string variable_name("var_direct_pair");
+        statePrc = netPrc;
+        auto variable = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{{inputDynamicShapes[1][0], inputDynamicShapes[2][1]}, statePrc, variable_name});
+
+        auto read = std::make_shared<ov::op::v6::ReadValue>(mm_0, variable);
+        std::shared_ptr<ov::Node> add_0 = std::make_shared<ov::op::v1::Add>(input_params[0], read);
+        add_0->set_friendly_name("add_0");
+        auto assign = std::make_shared<ov::op::v6::Assign>(directPair ? read : add_0, variable);
+        auto res = std::make_shared<ov::op::v0::Result>(add_0);
+        function = std::make_shared<ov::Model>(ov::ResultVector({res}), ov::SinkVector({assign}), input_params);
+    }
+
+    void check_init_graph_node() override {
+        // Nodes with friendly names "init_graph/add_1" and "init_graph/mm_0" should be moved into the subgraph.
+        CheckNumberOfNodesWithType(compiledModel, "Add", 0);
+        CheckNumberOfNodesWithType(compiledModel, "MatMul", 0);
+    }
+
+    ov::Shape get_state_shape(size_t i) override {
+        return ov::Shape({inputShapes[1].second[i][0], inputShapes[2].second[i][1]});
+    }
+};
+
+TEST_P(InitGraphStatefulModel, CompareWithRefs) {
+    run();
+}
+
+// ReadValueWithSubgraph with differing input and output precisions.
+//
+//   input[fp32]
+//       |
+//   Convert[fp32->fp16]
+//       |
+//   ReadValue ..........
+//    /      \          .
+//  Add     Assign ......
+//   |
+// Result
+
+class InitGraphStatefulDiffPrimitiveModel : public InitGraphStatefulModelBase {
+public:
+    void SetUp() override {
+        targetDevice = utils::DEVICE_CPU;
+
+        configuration.insert({"SNIPPETS_MODE", "DISABLE"});
+
+        bool directPair;
+        std::tie(inputShapes, directPair) = this->GetParam();
+
+        init_input_shapes(inputShapes);
+        ov::ParameterVector input_params;
+        for (auto&& shape : inputDynamicShapes) {
+            input_params.push_back(std::make_shared<ov::op::v0::Parameter>(netPrc, shape));
+        }
+
+        input_params[0]->set_friendly_name("input");
+
+        // init_graph
+        auto convert = std::make_shared<ov::op::v0::Convert>(input_params[0], ov::element::f16);
+        convert->set_friendly_name("init_graph/convert");
+
+        const std::string variable_name("var_diff_precision");
+        statePrc = ov::element::f16;
+        auto variable = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{{inputDynamicShapes[0]}, statePrc, variable_name});
+
+        auto readvalue = std::make_shared<ov::op::v6::ReadValue>(convert, variable);
+
+        std::shared_ptr<ov::Node> add =
+            std::make_shared<ov::op::v1::Add>(readvalue, ov::op::v0::Constant::create(ov::element::f16, {1}, {1.0f}));
+
+        auto assign = std::make_shared<ov::op::v6::Assign>(directPair ?
readvalue : add, variable);
+
+        auto res = std::make_shared<ov::op::v0::Result>(add);
+
+        function = std::make_shared<ov::Model>(ov::ResultVector({res}), ov::SinkVector({assign}), input_params);
+    }
+
+    void check_init_graph_node() override {
+        CheckNumberOfNodesWithType(compiledModel, "Convert", 1);
+    }
+
+    ov::Shape get_state_shape(size_t i) override {
+        return inputShapes[0].second[i];
+    }
+};
+
+TEST_P(InitGraphStatefulDiffPrimitiveModel, CompareWithRefs) {
+    run();
+}
+
+namespace {
+const std::vector<std::vector<InputShape>> inputShapes = {
+    {
+        // Dynamic shape.
+        {{1, -1}, {{1, 2}, {1, 2}, {1, 1}}},
+        {{2, -1}, {{2, 3}, {2, 10}, {2, 1}}},
+        {{-1, 2}, {{3, 2}, {10, 2}, {1, 2}}},
+    },
+    {
+        // Static shape.
+        {{1, 1}, {{1, 1}}},
+        {{4, 2}, {{4, 2}}},
+        {{2, 10}, {{2, 10}}},
+    }
+};
+
+const std::vector<bool> readValueAssignDirectPair = {true, false};
+
+const auto testParams_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes),
+    ::testing::ValuesIn(readValueAssignDirectPair));
+
+INSTANTIATE_TEST_SUITE_P(smoke_StatefulInitGraph,
+                         InitGraphStatefulModel,
+                         testParams_smoke,
+                         InitGraphStatefulModel::getTestCaseName);
+
+
+const std::vector<std::vector<InputShape>> inputShapesDiffPrecision = {
+    {
+        // Dynamic shape.
+        {{1, -1}, {{1, 10}, {1, 1}}},
+    },
+    {
+        // Static shape.
+        {{1, 1}, {{1, 1}}},
+    }
+};
+
+const auto testParamsDiffPrecision_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapesDiffPrecision),
+    ::testing::ValuesIn(readValueAssignDirectPair));
+
+INSTANTIATE_TEST_SUITE_P(smoke_StatefulInitGraph,
+                         InitGraphStatefulDiffPrimitiveModel,
+                         testParamsDiffPrecision_smoke,
+                         InitGraphStatefulDiffPrimitiveModel::getTestCaseName);
+
+} // namespace
+
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/bf16_convert_saturation.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/bf16_convert_saturation.cpp
new file mode 100644
index 00000000000000..96c08eeffed15a
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/bf16_convert_saturation.cpp
@@ -0,0 +1,114 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "utils/fusing_test_utils.hpp"
+
+using namespace CPUTestUtils;
+namespace ov {
+namespace test {
+/*
+    This test aims to cover the Eltwise node's BF16 output precision conversion logic in "saturation" mode. In this
+    test, we have a Select node with a condition input of boolean type and then/else inputs of f32 type (as constant
+    nodes with bf16 overflow data). The Select node is followed by a convolution node to ensure that it is converted
+    to bf16 precision.
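+    In saturation mode, values outside the representable bf16 range are expected to clamp to the largest finite
+    bf16 value instead of converting to infinity; the constant then/else inputs exercise the doOutputSaturation
+    path that the eltwise executor enables when one of its inputs is a constant.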
+*/ +using selectParams = std::tuple; +class BF16ConvertSaturation : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CpuTestWithFusing { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + InputShape shapes; + ElementType precision; + std::tie(shapes, precision) = obj.param; + + std::ostringstream result; + result << "Condition_prc_" << ElementType::boolean << "_Then_Else_prc_" << precision << "_"; + result << "IS=(" << shapes.first << ")_TS=("; + for (const auto& item : shapes.second) { + result << ov::test::utils::vec2str(item) << "_"; + } + result << "PluginConf_inference_precision=bf16"; + + return result.str(); + } + +protected: + void SetUp() override { + abs_threshold = 0; + targetDevice = ov::test::utils::DEVICE_CPU; + InputShape shapes; + ElementType precision; + std::tie(shapes, precision) = this->GetParam(); + init_input_shapes({shapes}); + std::tie(inFmts, outFmts, priority, selectedType) = emptyCPUSpec; + selectedType = makeSelectedTypeStr(getPrimitiveType(), ov::element::i8); + ov::element::TypeVector types{ov::element::boolean, precision, precision}; + ov::ParameterVector parameters; + auto param = std::make_shared(ov::element::boolean, inputDynamicShapes[0]); + parameters.push_back(param); + + ov::test::utils::InputGenerateData in_data; + in_data.start_from = -3.40282e+38; + in_data.range = 1; + in_data.resolution = 1; + auto thenTensor = ov::test::utils::create_and_fill_tensor(precision, ov::Shape{1}, in_data); + + in_data.start_from = 3.40282e+38; + in_data.range = 10; + in_data.resolution = 2; + auto elseTensor = ov::test::utils::create_and_fill_tensor(precision, ov::Shape{2, 1, 32, 32}, in_data); + + auto select = std::make_shared(parameters[0], + std::make_shared(thenTensor), + std::make_shared(elseTensor), + ov::op::AutoBroadcastType::NUMPY); + + auto conv_filter_shape = ov::Shape{1, 1, 3, 3}; + auto conv_filter = ov::op::v0::Constant::create(ElementType::f32, conv_filter_shape, {1}); + auto strides = ov::Strides{1, 1}; + auto pads_begin = ov::CoordinateDiff{0, 0}; + auto pads_end = ov::CoordinateDiff{0, 0}; + auto dilations = ov::Strides{1, 1}; + auto conv = + std::make_shared(select, conv_filter, strides, pads_begin, pads_end, dilations); + + function = makeNgraphFunction(ElementType::f32, parameters, conv, "Eltwise"); + configuration.insert({ov::hint::inference_precision(ov::element::bf16)}); + } + + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& modelInputs = function->inputs(); + ov::test::utils::InputGenerateData in_data; + in_data.start_from = -1; + in_data.range = 3; + in_data.resolution = 2; + auto condTensor = ov::test::utils::create_and_fill_tensor(modelInputs[0].get_element_type(), + targetInputStaticShapes[0], + in_data); + + inputs.insert({modelInputs[0].get_node_shared_ptr(), condTensor}); + } +}; + +TEST_P(BF16ConvertSaturation, CompareWithRefs) { + run(); +} + +const std::vector inShapes = { + // Condition + {{-1, -1, -1, -1}, {{2, 1, 32, 32}}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BF16ConvertSaturationTest, + BF16ConvertSaturation, + ::testing::Combine(::testing::ValuesIn(inShapes), ::testing::Values(ElementType::f32)), + BF16ConvertSaturation::getTestCaseName); + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp index 9ff4d0b989fefa..903b8c083b1a1f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp @@ -32,6 +32,17 @@ const std::vector types = { ov::element::f64, }; +const std::vector floatTypes = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +const std::vector f8Types = { + ov::element::f8e4m3, + ov::element::f8e5m2, +}; + INSTANTIATE_TEST_SUITE_P(smoke_ConversionLayerTest, ConversionLayerTest, ::testing::Combine(::testing::ValuesIn(conversionOpTypes), @@ -49,4 +60,23 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConversionToBooleanLayerTest, ::testing::Values(ov::element::boolean), ::testing::Values(ov::test::utils::DEVICE_CPU)), ConversionLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConversionToF8LayerTest, + ConversionLayerTest, + ::testing::Combine(::testing::Values(conversionOpTypes[0]), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(shapes)), + ::testing::ValuesIn(floatTypes), + ::testing::ValuesIn(f8Types), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ConversionLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConversionFromF8LayerTest, + ConversionLayerTest, + ::testing::Combine(::testing::Values(conversionOpTypes[0]), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(shapes)), + ::testing::ValuesIn(f8Types), + ::testing::ValuesIn(floatTypes), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ConversionLayerTest::getTestCaseName); + } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 7af707df602bfc..4c34b3fd2506ac 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -173,6 +173,8 @@ std::vector disabledTestPatterns() { R"(.*smoke_TopK/TopKLayerTest.Inference.*_k=21_.*_sort=value_modelType=f16_trgDev=CPU.*)", // Issue: 121812 R"(.*ConvertCPULayerTest.*outFmts=(nhwc|nChw8c|nChw16c).*)", + // Issue: MFDNN-12917. The oneDNN emitter of conversion from fp32 to fp8 has rounding issue. + R"(.*ConvertCPULayerTest.*(\[1.1.1080.1920\]|\(2.17.5.4\))_.*_inputPRC=f32_targetPRC=f8e4m3_.*)", // Need to generate sequence exactly in the i64 data type. Enable in scope of i64 enabling. 
R"(.*RandomUniformLayerTestCPU.*OutPrc=i64.*)", // Issue: 123815 (Tests are sensintive to available thread count on testing machines) @@ -529,6 +531,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConvertCPULayerTest.*f16.*)"); } #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM) if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) { @@ -536,6 +539,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*Prc=f16.*)"); retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*HasShapeOf=1.*)"); + retVector.emplace_back(R"(.*ConvertCPULayerTest.*f16.*)"); } else { // Issue 117407 retVector.emplace_back( diff --git a/src/plugins/intel_cpu/tests/unit/transformations/readvalue_subgraph.cpp b/src/plugins/intel_cpu/tests/unit/transformations/readvalue_subgraph.cpp new file mode 100644 index 00000000000000..3656130b579edd --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/transformations/readvalue_subgraph.cpp @@ -0,0 +1,232 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/read_value.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" + +using namespace testing; +/**************************************************************** + * Pattern 1 (From whisper decoder): + * input input + * | | + * MatMul ReadValueWithSubgraph (MatMul) + * | -> | \ + * ReadValue Result Assign + * | \ + * Result Assign + ****************************************************************/ +static std::shared_ptr constructRVWithSubGraph( + std::shared_ptr input, + const ov::element::Type& type, + std::shared_ptr variable) { + auto mm_weights = std::make_shared(type, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + + auto func_input = + std::make_shared(input->get_element_type(), input->get_output_partial_shape(0)); + + auto matmul = std::make_shared(func_input, mm_weights, false, false); + + auto func_output = std::make_shared(matmul); + + auto func = std::make_shared(ov::NodeVector({func_output}), + ov::ParameterVector{func_input}, + "state_init_submodel"); + + auto readvalue = std::make_shared(variable, func); + readvalue->set_input(input->output(0), func_input); + readvalue->set_output(func_output); + readvalue->validate_and_infer_types(); + + return readvalue; +} + +TEST(TransformationTests, ReadValueWithSubgraph_1) { + std::shared_ptr model(nullptr), model_ref(nullptr); + { + const ov::PartialShape shape{1, 1, 2}; + const ov::element::Type type = ov::element::f32; + std::shared_ptr variable = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape{1, 1, 2}, type, "var_id"}); + + { + auto input = std::make_shared(type, shape); + + auto mm_weights = + std::make_shared(type, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + + auto matmul = std::make_shared(input, mm_weights, false, false); + + auto readvalue = std::make_shared(matmul, variable); + + auto assign = std::make_shared(readvalue, variable); + + auto result = std::make_shared(readvalue); + model = std::make_shared(ov::ResultVector{result}, + ov::SinkVector{assign}, + 
ov::ParameterVector{input}); + + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(model); + } + { + auto input = std::make_shared(type, shape); + + auto readvalue = constructRVWithSubGraph(input, type, variable); + + auto assign = std::make_shared(readvalue, variable); + + auto result = std::make_shared(readvalue); + + model_ref = std::make_shared(ov::ResultVector{result}, + ov::SinkVector{assign}, + ov::ParameterVector{input}); + } + auto res = compare_functions(model, model_ref, 0, 0, 0, 0, 0, 0); + ASSERT_TRUE(res.first) << res.second; + } +} + +/*************************************************************************************************** + * Pattern 2 (Complex pattern): + * input input + * | | + * Convert Convert + * / | \ / | \ + * / | \ / Add2 \ + * Add1 Add2 \ | | \ | + * | | \ | ---> | | Add3 + * \ | Add3 | | / \ + * \ | / \ ReadValueWithSubgraph Result2 Subgraph(Add1, Add4, Add5) + * \ Add4 \ / \ + * \ | \ Result1 Assign + * Add5 Result2 + * | + * ReadValue + * / \ + * Result1 Assign + * + ***************************************************************************************************/ + +static std::shared_ptr create_const_node(ov::Shape shape) { + return std::make_shared(ov::element::i32, shape, std::vector{1}); +} + +static std::shared_ptr constructRVWithSubGraph2( + ov::NodeVector inputs, + const ov::element::Type& type, + std::shared_ptr variable) { + ov::ParameterVector func_inputs; + for (auto input : inputs) { + auto func_input = + std::make_shared(input->get_element_type(), input->get_output_partial_shape(0)); + func_inputs.push_back(func_input); + } + + auto add1 = std::make_shared(func_inputs[0], create_const_node(ov::Shape{4})); + + auto add4 = std::make_shared(func_inputs[1], func_inputs[2]); + + auto add5 = std::make_shared(add1, add4); + + auto func_output = std::make_shared(add5); + + auto func = std::make_shared(ov::NodeVector({func_output}), func_inputs, "state_init_submodel"); + + auto readvalue = std::make_shared(variable, func); + for (size_t i = 0; i < inputs.size(); i++) { + readvalue->set_input(inputs[i]->output(0), func_inputs[i]); + } + readvalue->set_output(func_output); + readvalue->validate_and_infer_types(); + + return readvalue; +} + +TEST(TransformationTests, ReadValueWithSubgraph_2) { + std::shared_ptr model(nullptr), model_ref(nullptr); + { + const ov::PartialShape shape{1, 2, 4}; + const ov::element::Type in_type = ov::element::f32; + const ov::element::Type out_type = ov::element::i32; + + std::shared_ptr variable = + std::make_shared(ov::op::util::VariableInfo{shape, out_type, "var_id"}); + + { + auto input = std::make_shared(in_type, shape); + input->set_friendly_name("input"); + + auto convert = std::make_shared(input, out_type); + convert->set_friendly_name("convert"); + + auto add1 = std::make_shared(convert, create_const_node(ov::Shape{4})); + add1->set_friendly_name("add1"); + + auto add2 = std::make_shared(convert, create_const_node(ov::Shape{4})); + add2->set_friendly_name("add2"); + + auto add3 = std::make_shared(add2, convert); + add3->set_friendly_name("add3"); + + auto add4 = std::make_shared(add2, add3); + add4->set_friendly_name("add4"); + + auto add5 = std::make_shared(add1, add4); + add5->set_friendly_name("add5"); + + auto readvalue = std::make_shared(add5, variable); + readvalue->set_friendly_name("readvalue"); + + auto assign = std::make_shared(readvalue, variable); + assign->set_friendly_name("assign"); + + auto result1 = std::make_shared(readvalue); + result1->set_friendly_name("result1"); 
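+            // As the diagram above shows, the pass is expected to move the nodes that
+            // only feed the ReadValue initializer (Add1, Add4, Add5) into the body of
+            // a ReadValueWithSubgraph op, while nodes that also have other users
+            // (Convert, Add2, Add3) remain in the outer graph.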
+ + auto result2 = std::make_shared(add3); + result2->set_friendly_name("result2"); + + model = std::make_shared(ov::ResultVector{result1, result2}, + ov::SinkVector{assign}, + ov::ParameterVector{input}); + + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(model); + } + { + auto input = std::make_shared(in_type, shape); + + auto convert = std::make_shared(input, out_type); + + auto add2 = std::make_shared(convert, create_const_node(ov::Shape{4})); + + auto add3 = std::make_shared(add2, convert); + + auto readvalue = constructRVWithSubGraph2({convert, add2, add3}, out_type, variable); + + auto assign = std::make_shared(readvalue, variable); + + auto result1 = std::make_shared(readvalue); + + auto result2 = std::make_shared(add3); + + model_ref = std::make_shared(ov::ResultVector{result1, result2}, + ov::SinkVector{assign}, + ov::ParameterVector{input}); + } + auto res = compare_functions(model, model_ref, 0, 0, 0, 0, 0, 0); + ASSERT_TRUE(res.first) << res.second; + } +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index c7524f1880157d..0950614897ab43 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -267,6 +267,7 @@ REGISTER_FACTORY(v13, ScaledDotProductAttention); REGISTER_FACTORY(v13, BitwiseAnd); REGISTER_FACTORY(v13, BitwiseOr); REGISTER_FACTORY(v13, BitwiseXor); +REGISTER_FACTORY(v13, FakeConvert); // ------------------------------ Supported v15 ops ----------------------------- // REGISTER_FACTORY(v15, ROIAlignRotated); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fake_convert.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fake_convert.hpp new file mode 100644 index 00000000000000..c16af0be51abda --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fake_convert.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" +#include + +namespace cldnn { + +/// @brief FakeConvert performs element-wise quantization of input values +/// into a set of values corresponding to a target low-precision type. +struct fake_convert : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(fake_convert) + + fake_convert() : primitive_base("", {}) {} + + /// @brief Constructs fake_convert primitive. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param scale Scale primitive id. + /// @param shift Shift primitive id. + /// @param destination_type The low precision type to be emulated. + fake_convert(const primitive_id& id, + const input_info& input, + const input_info& scale, + const input_info& shift, + ov::element::Type destination_type = ov::element::Type_t::f8e4m3) + : primitive_base(id, {input, scale, shift}, 1), destination_type(destination_type) {} + + /// @brief Constructs fake_convert primitive. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param scale Scale primitive id. + /// @param shift Shift primitive id. + /// @param destination_type The low precision type to be emulated. 
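+    ///
+    /// A rough sketch of the emulation performed (assuming the v13 FakeConvert
+    /// semantics; the shift term is simply zero for this two-input overload):
+    ///   scaled   = input * scale;
+    ///   lowered  = convert(scaled, destination_type);   // saturating downconvert
+    ///   restored = convert(lowered, input_element_type);
+    ///   output   = restored / scale;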
+ fake_convert(const primitive_id& id, + const input_info& input, + const input_info& scale, + ov::element::Type destination_type = ov::element::Type_t::f8e4m3) + : primitive_base(id, {input, scale}, 1), destination_type(destination_type) {} + + ov::element::Type destination_type; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, destination_type.get_type_name()); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + auto rhs_casted = downcast(rhs); + return (destination_type == rhs_casted.destination_type); + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << make_data(&destination_type, sizeof(destination_type)); + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> make_data(&destination_type, sizeof(destination_type)); + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp index e84311a9cfb592..c83b1127e2d44c 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp @@ -19,6 +19,7 @@ enum class impl_types : uint8_t { ocl = 1 << 2, onednn = 1 << 3, sycl = 1 << 4, + cm = 1 << 5, any = 0xFF, }; @@ -43,6 +44,7 @@ inline std::ostream& operator<<(std::ostream& out, const impl_types& impl_type) case impl_types::common: out << "common"; break; case impl_types::ocl: out << "ocl"; break; case impl_types::onednn: out << "onednn"; break; + case impl_types::cm: out << "cm"; break; case impl_types::any: out << "any"; break; default: out << "unknown"; break; } @@ -61,6 +63,8 @@ inline std::istream& operator>>(std::istream& is, impl_types& impl_type) { impl_type = impl_types::ocl; } else if (str == "onednn") { impl_type = impl_types::onednn; + } else if (str == "cm") { + impl_type = impl_types::cm; } else if (str == "any") { impl_type = impl_types::any; } else { diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp index f87f608597a6bb..2638f2ad60cf26 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp @@ -24,6 +24,10 @@ struct paged_attention : public primitive_base { OPENVINO_ASSERT(inputs.size() == 13, "[GPU] Unexpected inputs number for PagedAttention primitive: ", inputs.size()); } + bool has_scores_output() const { + return num_outputs == 2; + } + bool operator==(const primitive& rhs) const override { return compare_common_params(rhs); } diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp index 09dfcf68f05725..9a26768d0fc068 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp @@ -16,6 +16,11 @@ struct work_group_sizes { std::vector local; }; +enum class kernel_language { + OCLC, + CM, +}; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Scalar //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -122,8 +127,10 @@ 
struct kernel_string { std::string entry_point; bool batch_compilation; bool has_microkernels; + kernel_language language; - kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {} + kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), + batch_compilation(false), has_microkernels(false), language(kernel_language::OCLC) {} std::string get_str() const { return str + jit + undefs + options + entry_point; } size_t get_hash() const { return std::hash()(get_str()); } diff --git a/src/plugins/intel_gpu/src/graph/fake_convert.cpp b/src/plugins/intel_gpu/src/graph/fake_convert.cpp new file mode 100644 index 00000000000000..b201378d52cc8d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/fake_convert.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fake_convert_inst.h" +#include "fake_convert_shape_inference.hpp" + +#include "primitive_type_base.h" +#include "intel_gpu/runtime/error_handler.hpp" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(fake_convert) + +layout fake_convert_inst::calc_output_layout(fake_convert_node const& node, kernel_impl_params const& impl_param) { + return calc_output_layouts(node, impl_param)[0]; +} + +template +std::vector fake_convert_inst::calc_output_layouts(fake_convert_node const& node, kernel_impl_params const& impl_param) { + const auto& input_layout = impl_param.get_input_layout(0); + auto output_type = ov::element::Type(input_layout.data_type); + + OPENVINO_ASSERT(ov::element::Type::merge(output_type, output_type, ov::element::Type(impl_param.get_input_layout(1).data_type)), + "Mixed input types are not supported."); + + if (impl_param.input_layouts.size() == 3) { + OPENVINO_ASSERT(ov::element::Type::merge(output_type, output_type, ov::element::Type(impl_param.get_input_layout(2).data_type)), + "Mixed input types are not supported."); + } + + switch (output_type) { + case ov::element::bf16: + case ov::element::f16: + case ov::element::f32: + break; + default: + OPENVINO_THROW("The output data type should be a bf16, f16, f32 but got: ", output_type); + } + + return { layout{input_layout.get_partial_shape(), output_type, input_layout.format} }; +} + +template std::vector fake_convert_inst::calc_output_layouts(fake_convert_node const& node, const kernel_impl_params& impl_param); + +std::string fake_convert_inst::to_string(fake_convert_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + auto& scale = node.scale(); + + std::stringstream primitive_description; + + json_composite fake_convert_info; + fake_convert_info.add("input id", input.id()); + fake_convert_info.add("scale id", scale.id()); + if (node.has_shift()) { + fake_convert_info.add("shift id", node.shift().id()); + } + fake_convert_info.add("destination_type", node.get_destination_type().get_type_name()); + + node_info->add("fake_convert info", fake_convert_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +fake_convert_inst::typed_primitive_inst(network& network, fake_convert_node const& node) + : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp index 9539117bcf4b18..a40c7dfebb9de6 100644 --- 
a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "broadcast_inst.h" #include "shape_of_inst.h" #include "read_value_inst.h" #include "reshape_inst.h" @@ -86,6 +87,13 @@ bool mark_shape_of_subgraphs::can_mark_node(const program_node& node) { return false; } + // skip mark_node for broadcast node if dependency nodes are data and shape_of + auto& dependencies = node.get_dependencies(); + if (node.is_type() && dependencies.size() == 2) { + if (dependencies[0].first->is_type() && dependencies[1].first->is_type()) + return false; + } + return true; } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 1e5f943600fc05..ac7810c6e9154c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -295,6 +295,12 @@ void remove_redundant_reorders::run(program& p) { auto o_layout = r_node.get_output_layout(); const auto& i_layout = r_node.get_input_layout(0); + auto is_r_node_rank_changed = r_node.get_output_layout().get_rank() != r_node.get_dependency(0).get_output_layout().get_rank(); + if (is_r_node_rank_changed && + ((!update_implementations && r_node.get_dependency(0).is_type()) || + (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized()))) + continue; + // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer, // but pads need to be handled correctly. if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() && diff --git a/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.cpp b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.cpp new file mode 100644 index 00000000000000..c4ec8da18c7136 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "impls/cm/impl_example.hpp" + +#include "fully_connected/cm/fully_connected_cm_kernel_selector.h" +#include "fully_connected/fully_connected_params.h" +#include "fully_connected_inst.h" +#include "impls/ocl/primitive_base.hpp" + +namespace cldnn { +namespace cm { + +struct example_impl : ocl::typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::fully_connected_cm_kernel_selector; + using kernel_params_t = kernel_selector::fully_connected_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cm::example_impl) + + example_impl() = default; + + std::unique_ptr clone() const override { + return make_deep_copy(*this); + } + +protected: + kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { + kernel_arguments_data args = parent::get_arguments(instance); + const auto& desc = instance.get_typed_desc(); + + args.weights = instance.weights_memory(); + args.bias = instance.bias_term() ? instance.bias_memory() : nullptr; + + args.inputs = {instance.input_memory_ptr(0)}; + size_t in_id = instance.bias_term() ? 
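+            // Dependency order for a compressed FC is {input, weights[, bias],
+            // scale[, zero-point]}, so the first decompression input sits at dep
+            // index 3 when a bias is present and at 2 otherwise: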
3 : 2; + if (!desc->decompression_scale.empty()) + args.inputs.push_back(instance.dep_memory_ptr(in_id++)); + + if (!desc->decompression_zero_point.empty()) + args.inputs.push_back(instance.dep_memory_ptr(in_id)); + + return args; + } + +public: + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + auto params = get_weights_bias_default_params(impl_param, + false, + is_shape_agnostic); + return params; + } +}; +std::unique_ptr ExampleImplementationManager::create_impl(const program_node& node, + const kernel_impl_params& params) const { + OPENVINO_ASSERT(node.is_type()); + return ocl::typed_primitive_impl_ocl::create( + static_cast(node), + params); +} +} // namespace cm +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cm::example_impl) diff --git a/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.hpp b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.hpp new file mode 100644 index 00000000000000..0208da12a2f378 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "fully_connected_inst.h" +#include "impls/registry/implementation_manager.hpp" + +namespace cldnn { +namespace cm { + +struct ExampleImplementationManager : public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("cm::example") + ExampleImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) + : ImplementationManager(impl_types::cm, shape_type, vf) {} + + std::unique_ptr create_impl(const program_node& node, + const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + + auto &engine = node.get_program().get_engine(); + auto &config = node.get_program().get_config(); + if (!check_cm_jit_support(engine, config)) { + return false; + } + + // Example impl should not be chosen unless forced + return false; + } +}; + +} // namespace cm +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp new file mode 100644 index 00000000000000..a5f94741c40bf5 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "impls/cpu/cpu_impl_helpers.hpp" +#include "register.hpp" +#include "fake_convert_inst.h" +#include "impls/registry/implementation_map.hpp" + +#include "openvino/op/fake_convert.hpp" + +namespace cldnn { +namespace cpu { + +struct fake_convert_impl : public typed_primitive_impl { + using parent = typed_primitive_impl; + using parent::parent; + + ov::element::Type destination_type; + + std::shared_ptr op; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::fake_convert_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + fake_convert_impl() : parent("fake_convert_cpu_impl") {} + + explicit fake_convert_impl(const fake_convert_node& outer) { + set_node_params(outer); + } + + void set_node_params(const program_node& arg) override { + OPENVINO_ASSERT(arg.is_type(), "[GPU] Incorrect program_node type"); + const auto& node = arg.as(); + destination_type = node.get_destination_type(); + } + + void save(BinaryOutputBuffer& ob) const override { + parent::save(ob); + ob << make_data(&destination_type, sizeof(destination_type)); + } + + void 
load(BinaryInputBuffer& ib) override { + parent::load(ib); + ib >> make_data(&destination_type, sizeof(destination_type)); + } + + event::ptr execute_impl(const std::vector& events, fake_convert_inst& instance) override { + OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "fake_convert::execute_impl"); + auto& stream = instance.get_network().get_stream(); + + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); + + if (!pass_through_events) { + stream.wait_for_events(events); + } + + auto params = instance.get_impl_params(); + + ov::TensorVector input_host_tensors; + ov::TensorVector output_host_tensors; + + if (!op) { + op = std::make_shared(); + op->set_destination_type(destination_type); + } + + std::vector input_mem_ptrs; + for (size_t i = 0; i < instance.dependencies().size(); i++) + input_mem_ptrs.push_back(instance.dep_memory_ptr(i)); + + auto output_mem_ptr = instance.output_memory_ptr(); + + cldnn::mem_lock output_lock(output_mem_ptr, stream); + + for (size_t i = 0; i < input_mem_ptrs.size(); i++) + input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); + + output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data())); + + OPENVINO_ASSERT(op->evaluate(output_host_tensors, input_host_tensors), + "[GPU] Couldn't execute fake_convert primitive with id ", instance.id()); + + if (pass_through_events) { + return stream.group_events(events); + } + + return make_output_event(stream, instance.is_output()); + } + + void init_kernels(const kernels_cache& , const kernel_impl_params&) override {} + + void update(primitive_inst& inst, const kernel_impl_params& impl_param) override {} + +public: + static std::unique_ptr create(const fake_convert_node& arg, const kernel_impl_params& impl_param) { + return make_unique(); + } +}; + + +namespace detail { + +attach_fake_convert_impl::attach_fake_convert_impl() { + auto formats = { + format::bfyx, + format::bfzyx, + format::bfwzyx, + format::bfuwzyx, + format::bfvuwzyx, + }; + + auto types = { + data_types::f32, + data_types::f16, + data_types::bf16 + }; + + implementation_map::add(impl_types::cpu, shape_types::static_shape, fake_convert_impl::create, types, formats); + implementation_map::add(impl_types::cpu, shape_types::dynamic_shape, fake_convert_impl::create, types, formats); +} + +} // namespace detail +} // namespace cpu +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::fake_convert_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::fake_convert) diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp index 2b0dc5b212158c..e86628444de439 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp @@ -31,6 +31,7 @@ void register_implementations() { REGISTER_CPU(tile); REGISTER_CPU(select); REGISTER_CPU(reduce); + REGISTER_CPU(fake_convert); } } // namespace cpu diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp index cb89eae29d8c56..15cc4b11c077eb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp @@ -56,6 +56,7 @@ REGISTER_CPU(broadcast); REGISTER_CPU(tile); REGISTER_CPU(select); REGISTER_CPU(reduce); +REGISTER_CPU(fake_convert); #undef REGISTER_CPU diff --git 
a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 42d83a0265d290..7d54129195ccc6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -121,6 +121,46 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) { namespace cldnn { +bool check_cm_jit_support(cldnn::engine& e, const cldnn::ExecutionConfig& config) { + auto device = e.get_device().get(); + + static std::mutex m; + std::lock_guard lock(m); + + static std::map cache; + if (cache.find(device) != cache.end()) { + return cache.at(device); + } + + std::shared_ptr kernel_string = std::make_shared(); + // This program checks if cm sources can be jitted by current IGC version + const char* kernel_code = R""""( + #include + #include + + extern "C" _GENX_MAIN_ void cm_check() { + unsigned int id = cm_linear_global_id(); + } + )""""; + + kernel_string->str = kernel_code; + kernel_string->options = " -cmc "; + kernel_string->entry_point = "cm_check"; + kernel_string->batch_compilation = true; + + try { + cldnn::kernel_impl_params dummy_params; + auto _kernels_cache_device_query = std::unique_ptr(new cldnn::kernels_cache(e, config, 0)); + _kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false); + _kernels_cache_device_query->build_all(); + cache[device] = true; + } catch (std::exception&) { + cache[device] = false; + } + + return cache.at(device); +} + bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) { auto device = e.get_device().get(); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index a8c715af98f198..bf8968fd4b255b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -306,6 +306,7 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params& } } +bool check_cm_jit_support(cldnn::engine& e, const cldnn::ExecutionConfig& config); bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config); } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index 5db452dcda26f0..b122195c8e1265 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -153,8 +153,12 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::string entry_point = kernel_string->entry_point; std::string options = kernel_string->options; bool batch_compilation = kernel_string->batch_compilation; + bool is_cm = kernel_string->language == kernel_language::CM; - if (batch_compilation) { + auto& headers = is_cm ? 
cm_batch_headers : batch_headers; + + // Order matters for cm options + if (batch_compilation && !is_cm) { options = reorder_options(options); } @@ -174,7 +178,7 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, const auto& batch_id = 0; // increase bucket id if and only if new bucket comes bucket_id = static_cast(program_buckets.size() - 1); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, headers, is_cm)); } // This is a temporary walk-around to avoid severe performance drop. @@ -205,7 +209,7 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, || current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end() || need_separate_batch(entry_point)) { const auto& batch_id = static_cast(current_bucket.size()); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, headers, is_cm)); } auto& current_batch = current_bucket.back(); @@ -270,12 +274,14 @@ kernels_cache::kernels_cache(engine& engine, const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor, - const std::map& batch_headers) + const std::map& batch_headers, + const std::map& cm_batch_headers) : _device(get_target_device(engine)) , _task_executor(task_executor) , _config(config) , _prog_id(prog_id) - , batch_headers(std::move(batch_headers)) { } + , batch_headers(std::move(batch_headers)) + , cm_batch_headers(std::move(cm_batch_headers)) { } static std::vector getProgramBinaries(cl::Program program) { // Get the size of the program binary in bytes. diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp index b08b087c55854a..1bb0ffbd2066bb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp @@ -58,7 +58,11 @@ class kernels_cache { bool has_microkernels; std::map> entry_point_to_id; - explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::map& batch_headers) + explicit batch_program(int32_t _bucket_id, + int32_t _batch_id, + std::string _options, + const std::map& batch_headers, + bool is_cm = false) : bucket_id(_bucket_id), batch_id(_batch_id), hash_value(0), @@ -68,17 +72,22 @@ class kernels_cache { dump_custom_program(false), has_microkernels(false), entry_point_to_id({}) { - static const std::vector micro_kernel_include_names { - "generic_vector_ops", - "tile_ops", - "sdpa_utils" - }; - for (const auto& kv : batch_headers) { - if (std::find(micro_kernel_include_names.begin(), micro_kernel_include_names.end(), kv.first) == micro_kernel_include_names.end()) { - source.push_back(kv.second); - } else { - micro_headers.push_back(kv.second); + if (!is_cm) { + static const std::vector micro_kernel_include_names { + "generic_vector_ops", + "tile_ops", + "sdpa_utils" + }; + for (const auto& kv : batch_headers) { + if (std::find(micro_kernel_include_names.begin(), micro_kernel_include_names.end(), kv.first) == micro_kernel_include_names.end()) { + source.push_back(kv.second); + } else { + micro_headers.push_back(kv.second); + } } + } else { + for (const auto& kv : batch_headers) + source.push_back(kv.second); } } }; @@ -97,6 +106,7 @@ class kernels_cache { std::map, uint32_t> _cached_binaries; 
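+    // CM (C-for-Metal) kernels are built against their own header map
+    // (cm_batch_headers below) rather than the OpenCL-C batch headers, and their
+    // option strings are never reordered, since option order is significant for CM.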
std::unordered_map _cached_kernels; std::map batch_headers; + std::map cm_batch_headers; std::unordered_map _kernel_batch_hash; void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; void build_batch(const batch_program& batch, compiled_kernels& compiled_kernels); @@ -112,7 +122,8 @@ class kernels_cache { const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor = nullptr, - const std::map& batch_headers = {}); + const std::map& batch_headers = {}, + const std::map& cm_batch_headers = {}); kernel::ptr get_kernel_from_cached_kernels(std::string id) const; std::vector get_kernels(const kernel_impl_params& params) const; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp index 9cf1a252564934..2bc377f2c1459a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp @@ -63,6 +63,7 @@ struct paged_attention_impl : multi_stage_primitive { void load(BinaryInputBuffer& ib) override { parent::load(ib); + ib >> make_data(&has_scores_output, sizeof(bool)); if (is_dynamic()) { auto& kv_cache_update_kernel_selector = kv_cache_update_kernel_selector_t::Instance(); auto kv_cache_update_kernel_impl = kv_cache_update_kernel_selector.GetImplementation(_kernels_data[Stage::KV_CACHE_UPDATE].kernelName); @@ -78,7 +79,45 @@ struct paged_attention_impl : multi_stage_primitive { } } + void save(BinaryOutputBuffer& ob) const override { + parent::save(ob); + ob << make_data(&has_scores_output, sizeof(bool)); + } + std::vector get_internal_buffer_layouts_impl() const override { + /* + * Internal buffers allocation owners and users: + * +--------------------------------------+--------------------+--------------------+ + * | Stage | Allocates & uses | Reuses | + * +--------------------------------------+--------------------+--------------------+ + * | KV_CACHE_UPDATE | [0, 1, 2] | | + * +--------------------------------------+--------------------+--------------------+ + * | SDPA (1st token) | | [0, 1, 2] | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (2nd+ token) | [5, 6, 7] | | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (mixed mode) | [5, 6, 7, 8] | | + * +--------------------------------------+--------------------+--------------------+ + * | SDPA (1st token) + scores output | | [0, 1, 2, 3, 4] | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (2nd+ token) + scores output | [3, 4, 5, 6, 7] | | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (mixed mode) + scores output | [3, 4, 5, 6, 7, 8] | | + * +--------------------------------------+--------------------+--------------------+ + * + * Description: + * 0, 1, 2 - Buffers used for proper blocks distribution for kv_cache_update and + * sdpa_opt (1st token calculation) block configuration over target_seq_len dimension. + * Filled in paged_attention_inst::on_execute() call. + * 3, 4 - Optional buffers used for PA scores output calculation, storing intermediate + * softmax values by partitions (filled in PA/SDPA kernels) and sequence length offsets + * for each subsequence (filled in paged_attention_inst::on_execute() call). + * 5, 6, 7 - Used for 2nd+ PA calculation (for softmax exp_sums, max_logits, and intermediate output). 
+ * Filled in PA/SDPA kernels. + * 8 - Optional buffer used for mixed PA execution mode, mapping gws idx to subsequence id. + * Filled in paged_attention_inst::on_execute() call. + */ + auto add_internal_buffers = [](std::vector& layouts, const kernel_selector::KernelData& kd) { if (kd.internalBufferSizes.empty()) return; @@ -133,6 +172,7 @@ struct paged_attention_impl : multi_stage_primitive { args.outputs = { instance.output_memory_ptr(0) }; } else if (stage == Stage::PA_SDPA) { if (kernel_idx == 0 || kernel_idx == 1) { + // 2nd+ token calculation or mixed stage tokens calculation args.shape_info = instance.shape_info_memory_ptr(); args.inputs = { instance.input_memory_ptr(0), @@ -155,7 +195,8 @@ struct paged_attention_impl : multi_stage_primitive { if (desc->has_alibi) { args.inputs.push_back(instance.alibi_memory_ptr()); } - } else { + } else if (kernel_idx == 2 || kernel_idx == 3) { + // Finalization kernel or mixed stage finalization kernel args.inputs = { instance.past_lens_memory_ptr() }; if (is_mixed_mode) { @@ -163,17 +204,31 @@ struct paged_attention_impl : multi_stage_primitive { // dependency args.inputs.push_back(instance.subsequence_begins_memory_ptr()); } + } else if (kernel_idx == 4) { + // Output scores calculation kernel + args.inputs = { instance.past_lens_memory_ptr(), + instance.subsequence_begins_memory_ptr() }; } args.outputs = { instance.output_memory_ptr(0) }; + + if (kernel_idx == 4) { + args.outputs.push_back(instance.output_memory_ptr(1)); + } } return args; } std::set get_lockable_internal_buffers() const override { - return std::set{ 0, 1, 2, /* SDPA and KV_CACHE_UPDATE indexes configuration */ - 6, /* PA_SDPA multiple tokens mode */ }; + size_t mixed_mode_buffer = has_scores_output ? 8 : 6; + + std::set lockable_ids = { 0, 1, 2, /* SDPA and KV_CACHE_UPDATE indexes configuration */ + mixed_mode_buffer /* PA_SDPA multiple tokens mode */ }; + if (has_scores_output) + lockable_ids.insert(4 /* Precalculated accumulated sequence length offsets for each subsequence */); + + return lockable_ids; }; void execute_stage(const std::vector& events, @@ -194,8 +249,17 @@ struct paged_attention_impl : multi_stage_primitive { if (stage == Stage::PA_SDPA) { internal_buffers_offset = _kernels_data[Stage::KV_CACHE_UPDATE].internalBufferSizes.size(); internal_buffers_count = _kernels_data[Stage::PA_SDPA].internalBufferSizes.size(); - } else { + } else if (stage == Stage::KV_CACHE_UPDATE) { + internal_buffers_count = _kernels_data[Stage::KV_CACHE_UPDATE].internalBufferSizes.size(); + } else if (stage == Stage::SDPA) { internal_buffers_count = _kernels_data[Stage::KV_CACHE_UPDATE].internalBufferSizes.size(); + + const auto desc = instance.get_node().as().get_primitive(); + if (desc->has_scores_output()) { + // Add intermediate buffers for PagedAttention scores calculation: + // softmax_results, subsequence_offsets, exp_sums, max_logits, tmp_out + internal_buffers_count += 5; + } } for (size_t kd_idx = 0; kd_idx < _kernels_data[stage].kernels.size(); ++kd_idx) { @@ -216,6 +280,23 @@ struct paged_attention_impl : multi_stage_primitive { intermediate_memories.begin() + internal_buffers_offset, intermediate_memories.begin() + internal_buffers_offset + internal_buffers_count); + GPU_DEBUG_TRACE_DETAIL << "Execute stage=" << stage << " kernel=" << kd_idx << " " << _kernels_data[stage].kernelName << " start_offset=" + << internal_buffers_offset << " count=" << internal_buffers_count << "\n"; + + GPU_DEBUG_TRACE_DETAIL << "Configured kernel arguments:\n"; + for (size_t i = 0; i < 
_kernels_data[stage].kernels[kd_idx].params.arguments.size(); i++) { + GPU_DEBUG_TRACE_DETAIL << "\t" << i << ": type=" << static_cast(_kernels_data[stage].kernels[kd_idx].params.arguments[i].t) << " " + << "index=" << _kernels_data[stage].kernels[kd_idx].params.arguments[i].index << "\n"; + } + + GPU_DEBUG_TRACE_DETAIL << "Memory buffers:" + << "shape_info=" << args.shape_info << " " + << "inputs=" << args.inputs.size() << " " + << "outputs=" << args.outputs.size() << " " + << "intermediates=" << args.intermediates.size() << " " + << "weights=" << args.weights << " " + << "scalars=" << (args.scalars ? args.scalars->size() : 0) << "\n"; + stream.set_arguments(*_kernels[idx_final], _kernels_data[stage].kernels[kd_idx].params, args); const auto& gws = params.workGroups.global; @@ -242,10 +323,13 @@ struct paged_attention_impl : multi_stage_primitive { execute_stage(events, instance, res_events, Stage::KV_CACHE_UPDATE, is_mixed_mode); - std::vector dep_events(res_events.begin(), res_events.end()); if (stage == PagedAttentionStage::PREFILL) { + std::vector dep_events(res_events.begin(), res_events.end()); execute_stage(dep_events, instance, res_events, Stage::SDPA, is_mixed_mode); - } else if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED) { + } + + if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED || has_scores_output) { + std::vector dep_events(res_events.begin(), res_events.end()); execute_stage(dep_events, instance, res_events, Stage::PA_SDPA, is_mixed_mode); } @@ -338,7 +422,7 @@ struct paged_attention_impl : multi_stage_primitive { return aligned_seq_len; } - static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param) { + static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param, bool is_dynamic = true) { kernel_selector::sdpa_configuration config; const auto desc = impl_param.typed_desc(); @@ -362,37 +446,45 @@ struct paged_attention_impl : multi_stage_primitive { config.group_size = desc->heads_num / desc->kv_heads_num; } + if (desc->has_scores_output() && !is_dynamic) { + const auto& input_mem = impl_param.memory_deps; + const auto max_context_len = input_mem.at(12); + mem_lock max_context_len_mem_lock(max_context_len, *impl_param.strm); + config.paged_attention_max_len = max_context_len_mem_lock[0]; + } + return config; } static kv_cache_update_kernel_params_t get_kv_cache_update_kernel_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, + const kernel_selector::MultiDataTensor& input_tensors, bool is_dynamic = false) { auto params = get_default_params(impl_param, is_dynamic); - const auto& key_layout = impl_param.get_input_layout(1); - const auto& value_layout = impl_param.get_input_layout(2); - const auto& key_cache_layout = impl_param.get_input_layout(3); - const auto& value_cache_layout = impl_param.get_input_layout(4); - const auto& past_lens_layout = impl_param.get_input_layout(5); - const auto& block_indices_layout = impl_param.get_input_layout(7); - const auto& block_indices_begins_layout = impl_param.get_input_layout(8); - const auto& subsequence_begins_layout = impl_param.get_input_layout(6); + const auto& key_tensor = input_tensors[1]; + const auto& value_tensor = input_tensors[2]; + const auto& key_cache_tensor = input_tensors[3]; + const auto& value_cache_tensor = input_tensors[4]; + const auto& past_lens_tensor = input_tensors[5]; + const auto& block_indices_tensor = input_tensors[7]; + const 
auto& block_indices_begins_tensor = input_tensors[8]; + const auto& subsequence_begins_tensor = input_tensors[6]; const auto inputs_number = 6; const auto outputs_number = 2; params.inputs.resize(inputs_number); params.outputs.resize(outputs_number); - params.inputs[0] = convert_data_tensor(key_layout); - params.inputs[1] = convert_data_tensor(value_layout); - params.inputs[2] = convert_data_tensor(past_lens_layout); - params.inputs[3] = convert_data_tensor(block_indices_layout); - params.inputs[4] = convert_data_tensor(block_indices_begins_layout); - params.inputs[5] = convert_data_tensor(subsequence_begins_layout); - params.outputs[0] = convert_data_tensor(key_cache_layout); - params.outputs[1] = convert_data_tensor(value_cache_layout); + params.inputs[0] = key_tensor; + params.inputs[1] = value_tensor; + params.inputs[2] = past_lens_tensor; + params.inputs[3] = block_indices_tensor; + params.inputs[4] = block_indices_begins_tensor; + params.inputs[5] = subsequence_begins_tensor; + params.outputs[0] = key_cache_tensor; + params.outputs[1] = value_cache_tensor; - params.conf = get_sdpa_configuration(impl_param); + params.conf = get_sdpa_configuration(impl_param, is_dynamic); params.is_prefill = stage == PagedAttentionStage::PREFILL || stage == PagedAttentionStage::MIXED; @@ -418,18 +510,23 @@ struct paged_attention_impl : multi_stage_primitive { return params; } - static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, bool is_dynamic = false) { + static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, + const PagedAttentionStage& stage, + const kernel_selector::MultiDataTensor& input_tensors, + bool is_dynamic = false) { const auto desc = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_dynamic); - const auto& query_layout = impl_param.get_input_layout(0); - const auto& key_layout = impl_param.get_input_layout(1); - const auto& value_layout = impl_param.get_input_layout(2); - const auto& subsequence_begins_layout = impl_param.get_input_layout(6); - const auto& scale_layout = impl_param.get_input_layout(9); - const auto& alibi_layout = impl_param.get_input_layout(11); - const auto has_alibi = alibi_layout.count() > 0; + const auto& query_tensor = input_tensors[0]; + const auto& key_tensor = input_tensors[1]; + const auto& value_tensor = input_tensors[2]; + const auto& subsequence_begins_tensor = input_tensors[6]; + const auto& scale_tensor = input_tensors[9]; + const auto& alibi_tensor = input_tensors[11]; + + const auto has_alibi = impl_param.get_input_layout(11).count() > 0; const auto has_scale_input = !desc->scale_val.has_value(); + const auto has_scores_output = desc->has_scores_output(); auto inputs_number = 4; if (has_scale_input) @@ -440,18 +537,23 @@ struct paged_attention_impl : multi_stage_primitive { auto input_idx = 0; params.inputs.resize(inputs_number); - params.inputs[input_idx++] = convert_data_tensor(query_layout); - params.inputs[input_idx++] = convert_data_tensor(key_layout); - params.inputs[input_idx++] = convert_data_tensor(value_layout); - params.inputs[input_idx++] = convert_data_tensor(subsequence_begins_layout); + params.inputs[input_idx++] = query_tensor; + params.inputs[input_idx++] = key_tensor; + params.inputs[input_idx++] = value_tensor; + params.inputs[input_idx++] = subsequence_begins_tensor; if (has_scale_input) - params.inputs[input_idx++] = convert_data_tensor(scale_layout); + params.inputs[input_idx++] = scale_tensor; if 
(has_alibi) - params.inputs[input_idx++] = convert_data_tensor(alibi_layout); + params.inputs[input_idx++] = alibi_tensor; - params.conf = get_sdpa_configuration(impl_param); + if (has_scores_output) { + params.outputs.resize(2); + params.outputs[1] = convert_data_tensor(impl_param.get_output_layout(1)); + } + + params.conf = get_sdpa_configuration(impl_param, is_dynamic); const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; @@ -475,26 +577,34 @@ struct paged_attention_impl : multi_stage_primitive { if ((stage == PagedAttentionStage::PREFILL || stage == PagedAttentionStage::MIXED) && !is_dynamic) params.conf.paged_attention_aligned_seq_len = get_aligned_seq_len(impl_param, stage); + if (has_scores_output) + out_tensor_to_offset_map.insert({1, out_offsets_map.at(1)}); + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; } - static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, bool is_dynamic = false) { + static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, + const PagedAttentionStage& stage, + const kernel_selector::MultiDataTensor& input_tensors, + bool is_dynamic = false) { const auto desc = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_dynamic); - const auto& query_layout = impl_param.get_input_layout(0); - const auto& key_cache_layout = impl_param.get_input_layout(3); - const auto& value_cache_layout = impl_param.get_input_layout(4); - const auto& past_lens_layout = impl_param.get_input_layout(5); - const auto& block_indices_layout = impl_param.get_input_layout(7); - const auto& block_indices_begins_layout = impl_param.get_input_layout(8); - const auto& subsequence_begins_layout = impl_param.get_input_layout(6); - const auto& scale_layout = impl_param.get_input_layout(9); - const auto& alibi_layout = impl_param.get_input_layout(11); - const auto has_alibi = alibi_layout.count() > 0; + const auto& query_tensor = input_tensors[0]; + const auto& key_cache_tensor = input_tensors[3]; + const auto& value_cache_tensor = input_tensors[4]; + const auto& past_lens_tensor = input_tensors[5]; + const auto& block_indices_tensor = input_tensors[7]; + const auto& block_indices_begins_tensor = input_tensors[8]; + const auto& subsequence_begins_tensor = input_tensors[6]; + const auto& scale_tensor = input_tensors[9]; + const auto& alibi_tensor = input_tensors[11]; + + const auto has_alibi = impl_param.get_input_layout(11).count() > 0; const auto has_scale_input = !desc->scale_val.has_value(); + const auto has_scores_output = desc->has_scores_output(); auto inputs_number = 7; if (has_scale_input) @@ -505,28 +615,34 @@ struct paged_attention_impl : multi_stage_primitive { auto input_idx = 0; params.inputs.resize(inputs_number); - params.inputs[input_idx++] = convert_data_tensor(query_layout); - params.inputs[input_idx++] = convert_data_tensor(key_cache_layout); - params.inputs[input_idx++] = convert_data_tensor(value_cache_layout); - params.inputs[input_idx++] = convert_data_tensor(past_lens_layout); - params.inputs[input_idx++] = convert_data_tensor(block_indices_layout); - params.inputs[input_idx++] = convert_data_tensor(block_indices_begins_layout); - params.inputs[input_idx++] = convert_data_tensor(subsequence_begins_layout); - params.conf = get_sdpa_configuration(impl_param); + params.inputs[input_idx++] = query_tensor; + 
params.inputs[input_idx++] = key_cache_tensor; + params.inputs[input_idx++] = value_cache_tensor; + params.inputs[input_idx++] = past_lens_tensor; + params.inputs[input_idx++] = block_indices_tensor; + params.inputs[input_idx++] = block_indices_begins_tensor; + params.inputs[input_idx++] = subsequence_begins_tensor; + + params.conf = get_sdpa_configuration(impl_param, is_dynamic); if (has_scale_input) - params.inputs[input_idx++] = convert_data_tensor(scale_layout); + params.inputs[input_idx++] = scale_tensor; if (has_alibi) - params.inputs[input_idx++] = convert_data_tensor(alibi_layout); + params.inputs[input_idx++] = alibi_tensor; - params.multi_tokens_mode = stage == PagedAttentionStage::MIXED; + if (has_scores_output) { + params.outputs.resize(2); + params.outputs[1] = convert_data_tensor(impl_param.get_output_layout(1)); + } - if ((stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED) && !is_dynamic) { + params.stage = stage; + + if (!has_scores_output && !is_dynamic) { const auto& input_mem = impl_param.memory_deps; const auto max_context_len = input_mem.at(12); mem_lock max_context_len_mem_lock(max_context_len, *impl_param.strm); - params.max_context_len = max_context_len_mem_lock[0]; + params.conf.paged_attention_max_len = max_context_len_mem_lock[0]; } const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; @@ -552,6 +668,9 @@ struct paged_attention_impl : multi_stage_primitive { if (has_alibi) in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(11)}); + if (has_scores_output) + out_tensor_to_offset_map.insert({1, out_offsets_map.at(1)}); + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; @@ -560,14 +679,20 @@ struct paged_attention_impl : multi_stage_primitive { void update_dispatch_data(const kernel_impl_params& impl_param) override { const auto stage = get_paged_attention_stage(impl_param); - auto kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, impl_param.is_dynamic()); + kernel_selector::MultiDataTensor input_tensors; + for (const auto& input_layout : impl_param.input_layouts) + input_tensors.emplace_back(convert_data_tensor(input_layout)); + + auto kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); (_kernels_data[Stage::KV_CACHE_UPDATE].update_dispatch_data_func)(kv_cache_update_kernel_params, _kernels_data[Stage::KV_CACHE_UPDATE]); if (stage == PagedAttentionStage::PREFILL) { - auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, impl_param.is_dynamic()); + auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); (_kernels_data[Stage::SDPA].update_dispatch_data_func)(sdpa_kernel_params, _kernels_data[Stage::SDPA]); - } else if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED) { - auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, impl_param.is_dynamic()); + } + + if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED || has_scores_output) { + auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); (_kernels_data[Stage::PA_SDPA].update_dispatch_data_func)(pa_sdpa_kernel_params, _kernels_data[Stage::PA_SDPA]); } } @@ -576,20 +701,32 @@ struct paged_attention_impl : multi_stage_primitive { std::vector kernels_data; const auto stage = PagedAttentionStage::UNKNOWN; - auto 
kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, impl_param.is_dynamic()); + kernel_selector::MultiDataTensor input_tensors; + for (const auto& input_layout : impl_param.input_layouts) + input_tensors.emplace_back(convert_data_tensor(input_layout)); + + auto kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); auto& kv_cache_update_kernel_selector = kv_cache_update_kernel_selector_t::Instance(); kernels_data.push_back(kv_cache_update_kernel_selector.get_best_kernel(kv_cache_update_kernel_params)); - auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, impl_param.is_dynamic()); + auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); auto& sdpa_kernel_selector = sdpa_kernel_selector_t::Instance(); kernels_data.push_back(sdpa_kernel_selector.get_best_kernel(sdpa_kernel_params)); - auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, impl_param.is_dynamic()); + auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); auto& pa_sdpa_kernel_selector = pa_sdpa_kernel_selector_t::Instance(); kernels_data.push_back(pa_sdpa_kernel_selector.get_best_kernel(pa_sdpa_kernel_params)); - return cldnn::make_unique(kernels_data); + auto impl = cldnn::make_unique(kernels_data); + + const auto& desc = impl_param.typed_desc(); + impl->has_scores_output = desc->has_scores_output(); + + return impl; } + +private: + bool has_scores_output = false; }; namespace detail { diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/fake_convert_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/fake_convert_impls.cpp new file mode 100644 index 00000000000000..991ab5aa12657a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/registry/fake_convert_impls.cpp @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "registry.hpp" +#include "intel_gpu/primitives/fake_convert.hpp" +#include "primitive_inst.h" + +namespace ov { +namespace intel_gpu { + +using namespace cldnn; + +const std::vector>& Registry::get_implementations() { + static const std::vector> impls = { + OV_GPU_GET_INSTANCE_CPU(fake_convert, shape_types::static_shape) + OV_GPU_GET_INSTANCE_CPU(fake_convert, shape_types::dynamic_shape) + }; + + return impls; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp index 6f725150794fb6..6ea9eb33c7421c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp @@ -11,6 +11,10 @@ #include "impls/onednn/fully_connected_onednn.hpp" #endif +#if OV_GPU_WITH_CM + #include "impls/cm/impl_example.hpp" +#endif + namespace ov { namespace intel_gpu { @@ -26,6 +30,7 @@ const std::vector>& Registry +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program& prog) + : parent(prim, prog), destination_type(prim->destination_type) { + support_padding_all(true); + } + +public: + using parent::parent; + + program_node& input() const { return get_dependency(0); } + program_node& scale() const { return get_dependency(1); 
} + program_node& shift() const { return get_dependency(2); } + bool has_shift() const { return (get_dependencies().size() == 3); } + + ov::element::Type get_destination_type() const { return destination_type; } + + std::vector get_shape_infer_dependencies() const override { return {}; } + +private: + ov::element::Type destination_type; +}; + +using fake_convert_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(fake_convert_node const& /*node*/, const kernel_impl_params& impl_param); + static layout calc_output_layout(fake_convert_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(fake_convert_node const& node); + + typed_primitive_inst(network& network, fake_convert_node const& node); +}; + +using fake_convert_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h index a7918ba9c3719c..675d77296aa06b 100644 --- a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h @@ -7,14 +7,11 @@ #include "intel_gpu/primitives/paged_attention.hpp" #include "primitive_inst.h" +#include "sdpa/pa_sdpa_kernel_opt.h" + namespace cldnn { -enum PagedAttentionStage { - GENERATE = 0, - PREFILL = 1, - MIXED = 2, - UNKNOWN = 3 -}; +using PagedAttentionStage = kernel_selector::PagedAttentionStage; PagedAttentionStage get_paged_attention_stage(const kernel_impl_params& impl_param); @@ -61,6 +58,9 @@ class typed_primitive_inst : public typed_primitive_inst_base

prefill_network; diff --git a/src/plugins/intel_gpu/src/graph/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/paged_attention.cpp index 787fd184f75b6a..c761aaf63799cd 100644 --- a/src/plugins/intel_gpu/src/graph/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/paged_attention.cpp @@ -48,14 +48,38 @@ layout paged_attention_inst::calc_output_layout(const paged_attention_node& /*no template std::vector paged_attention_inst::calc_output_layouts(paged_attention_node const& /*node*/, kernel_impl_params const& impl_param) { - auto out_layout = impl_param.get_input_layout(0); + auto data_layout = impl_param.get_input_layout(0); const auto& key_cache_ps = impl_param.get_input_layout(3).get_partial_shape(); bool valid_block_size = key_cache_ps[3].is_dynamic() || key_cache_ps[3].get_length() == paged_attention::block_size; OPENVINO_ASSERT(valid_block_size, "[GPU] Incorrect block size for Paged Attention operation. " "Expected ", paged_attention::block_size, ", but got ", key_cache_ps[3].get_length()); - return {out_layout}; + std::vector output_layouts{ data_layout }; + + const auto& desc = impl_param.typed_desc(); + if (desc->has_scores_output()) { + const auto past_lens_idx = 5; + const auto output_dt = data_layout.data_type; + if (impl_param.get_input_layout(past_lens_idx).is_static()) { + const auto& memory_deps = impl_param.memory_deps; + const auto past_lens_mem = memory_deps.at(past_lens_idx); + mem_lock past_lens_mem_lock(past_lens_mem, *impl_param.strm); + + long int total_size = 0; + for (size_t i = 0; i < past_lens_mem_lock.size(); i++) { + total_size += past_lens_mem_lock[i]; + } + + total_size += static_cast(impl_param.get_input_layout(0).get_shape()[0]); + + output_layouts.push_back(layout{ov::PartialShape{total_size}, output_dt, format::bfyx}); + } else { + output_layouts.push_back(layout{ov::PartialShape::dynamic(1), output_dt, format::bfyx}); + } + } + + return output_layouts; } template std::vector @@ -81,45 +105,79 @@ std::string paged_attention_inst::to_string(const paged_attention_node& node) { } void paged_attention_inst::on_execute() { - auto stage = get_paged_attention_stage(*_impl_params); + const auto& desc = _impl_params->typed_desc(); + const bool has_scores_output = desc->has_scores_output(); + const auto stage = get_paged_attention_stage(*_impl_params); - if (stage == PagedAttentionStage::UNKNOWN || - stage == PagedAttentionStage::GENERATE) + if ((stage == PagedAttentionStage::UNKNOWN) || + (stage == PagedAttentionStage::GENERATE && !has_scores_output)) return; + auto& stream = get_network().get_stream(); + const auto past_lens_mem = past_lens_memory_ptr(); + const auto subsequence_begins_mem = subsequence_begins_memory_ptr(); + mem_lock past_lens_mem_lock(past_lens_mem, stream); + mem_lock subsequence_begins_mem_lock(subsequence_begins_mem, stream); + std::unique_ptr> subsequence_offsets_lock = nullptr; + + if (has_scores_output) { + const size_t subsequence_offsets_idx = 4; + + OPENVINO_ASSERT(_intermediates_memory.size() > subsequence_offsets_idx, + "[GPU] Unexpected number of intermediates buffers for Paged Attention for scores output calculation"); + + auto subsequence_offsets_mem = _intermediates_memory[subsequence_offsets_idx]; + subsequence_offsets_lock.reset(new mem_lock(subsequence_offsets_mem, stream)); + } + + if (stage == PagedAttentionStage::GENERATE) { + // For the generate stage it's not necessary to configure any other intermediate + // buffers. 
Simply calculate the offsets and exit + size_t subsequence_offsets_acc = 0; + for (size_t i = 0; i < subsequence_begins_mem_lock.size() - 1; i++) { + const auto past_len = past_lens_mem_lock[i]; + const auto seq_start = subsequence_begins_mem_lock[i]; + const auto seq_end = subsequence_begins_mem_lock[i + 1]; + const auto seq_length = seq_end - seq_start; + + if (subsequence_offsets_lock) { + subsequence_offsets_lock->operator[](i) = static_cast(subsequence_offsets_acc); + subsequence_offsets_acc += seq_length + past_len; + } + } + + return; + } + OPENVINO_ASSERT(_intermediates_memory.size() >= 3, "Unexpected number of intermediates buffers for Paged Attention at prefill stage"); const auto blocks_indexes_start_idx = 0; const auto blocks_indexes_end_idx = 1; const auto blocked_gws_subseq_mapping_idx = 2; - const auto past_lens_mem = past_lens_memory_ptr(); - auto subsequence_begins_mem = subsequence_begins_memory_ptr(); auto blocks_indexes_start_mem = _intermediates_memory[blocks_indexes_start_idx]; auto blocks_indexes_end_mem = _intermediates_memory[blocks_indexes_end_idx]; auto blocked_gws_subseq_mapping_mem = _intermediates_memory[blocked_gws_subseq_mapping_idx]; OPENVINO_ASSERT(subsequence_begins_mem->get_layout().data_type == data_types::i32); - auto& stream = get_network().get_stream(); - mem_lock past_lens_mem_lock(past_lens_mem, stream); - mem_lock subsequence_begins_mem_lock(subsequence_begins_mem, stream); mem_lock blocks_indexes_start_lock(blocks_indexes_start_mem, stream); mem_lock blocks_indexes_end_lock(blocks_indexes_end_mem, stream); mem_lock blocked_gws_subseq_mapping_mem_lock(blocked_gws_subseq_mapping_mem, stream); std::unique_ptr> sequential_gws_subseq_mapping_lock = nullptr; if (stage == PagedAttentionStage::MIXED) { - const auto sequential_gws_subseq_mapping_idx = 6; + const size_t sequential_gws_subseq_mapping_idx = has_scores_output ? 
8 : 6; OPENVINO_ASSERT(_intermediates_memory.size() > sequential_gws_subseq_mapping_idx, - "Unexpected number of intermediates buffers for Paged Attention for mixed stage"); + "[GPU] Unexpected number of intermediates buffers for Paged Attention for mixed stage"); auto sequential_gws_subseq_mapping_mem = _intermediates_memory[sequential_gws_subseq_mapping_idx]; sequential_gws_subseq_mapping_lock.reset(new mem_lock(sequential_gws_subseq_mapping_mem, stream)); } size_t index = 0; + size_t subsequence_offsets_acc = 0; const auto target_seq_len_block_size = 16; // TODO: Get block size from the impl for (size_t i = 0; i < subsequence_begins_mem_lock.size() - 1; i++) { const auto past_len = past_lens_mem_lock[i]; @@ -159,6 +217,11 @@ void paged_attention_inst::on_execute() { sequential_gws_subseq_mapping_lock->operator[](idx) = static_cast(i); } } + + if (subsequence_offsets_lock) { + subsequence_offsets_lock->operator[](i) = static_cast(subsequence_offsets_acc); + subsequence_offsets_acc += seq_length + past_len; + } } } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index bdffb9c4980722..c938be22b816ed 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -228,7 +228,8 @@ void program::init_program() { if (_task_executor == nullptr) _task_executor = program::make_task_executor(_config); _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_headers())); + kernel_selector::KernelBase::get_db().get_batch_headers(), + kernel_selector::KernelBase::get_db().get_cm_batch_headers())); _kernels_cache->set_kernels_reuse(get_config().get_property(ov::intel_gpu::hint::enable_kernels_reuse)); @@ -1501,6 +1502,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::strided_slice::type_id() && prim.type() != cldnn::region_yolo::type_id() && prim.type() != cldnn::normalize::type_id() && + prim.type() != cldnn::group_normalization::type_id() && prim.type() != cldnn::mvn::type_id() && prim.type() != cldnn::gather::type_id() && prim.type() != cldnn::scatter_nd_update::type_id() && @@ -1581,6 +1583,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::deconvolution::type_id() && prim.type() != cldnn::multiclass_nms::type_id() && prim.type() != cldnn::normalize::type_id() && + prim.type() != cldnn::group_normalization::type_id() && prim.type() != cldnn::deconvolution::type_id() && prim.type() != cldnn::unique_count::type_id() && prim.type() != cldnn::unique_gather::type_id() && diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index 2b32423f9ce3a8..0c29c8afb9ff01 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -11,11 +11,16 @@ file(GLOB_RECURSE LIBRARY_SRC "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" ) +list(FILTER LIBRARY_SRC EXCLUDE REGEX "${CMAKE_CURRENT_SOURCE_DIR}/cm_kernels/.*" ) file(GLOB_RECURSE KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/cl_kernels/*.cl" ) +file(GLOB_RECURSE CM_KERNELS + "${CMAKE_CURRENT_SOURCE_DIR}/cm_kernels/*" +) + # Path which points to root directory where code generated elements are created # (specific to build configuration). 
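# (Editorial note, hedged) The CM sources globbed above are intentionally kept
# out of LIBRARY_SRC: like the OpenCL .cl kernels, they are not compiled by the
# host toolchain but embedded as strings into a generated primitive database and
# JIT-compiled at runtime. Under that assumption, a new kernel dropped into the
# tree is picked up roughly as follows (file names are illustrative):
#
#   cm_kernels/my_kernel.cpp           -> embedded into ${CM_PRIM_DB}
#   cm_kernels/include/batch_headers/* -> embedded into ${CM_PRIM_DB_BATCH_HEADERS}
#
# The codegen root referenced by the comment above is defined next: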
set(CODEGEN_DIR "${CMAKE_CURRENT_BINARY_DIR}/codegen") @@ -28,8 +33,12 @@ set(CODEGEN_INCDIR "${CODEGEN_DIR}/include") set(PRIM_DB "ks_primitive_db.inc") set(PRIM_DB_BATCH_HEADERS "ks_primitive_db_batch_headers.inc") +set(CM_PRIM_DB "ks_cm_primitive_db.inc") +set(CM_PRIM_DB_BATCH_HEADERS "ks_cm_primitive_db_batch_headers.inc") set(CODEGEN_CACHE_SOURCES "${CODEGEN_INCDIR}/${PRIM_DB}" - "${CODEGEN_INCDIR}/${PRIM_DB_BATCH_HEADERS}") + "${CODEGEN_INCDIR}/${PRIM_DB_BATCH_HEADERS}" + "${CODEGEN_INCDIR}/${CM_PRIM_DB}" + "${CODEGEN_INCDIR}/${CM_PRIM_DB_BATCH_HEADERS}") set(CODEGEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/primitive_db_gen.py") # Helping with some generators. @@ -52,6 +61,22 @@ add_custom_command(OUTPUT "${CODEGEN_INCDIR}/${PRIM_DB}" COMMENT "Updating file if the file changed (${PRIM_DB}) ..." ) +add_custom_command(OUTPUT "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB}" + COMMAND "${Python3_EXECUTABLE}" "${CODEGEN_SCRIPT}" -out_path "${CODEGEN_CACHE_DIR}" + -out_file_name_prim_db "${CM_PRIM_DB}" + -out_file_name_batch_headers "${CM_PRIM_DB_BATCH_HEADERS}" + -kernels "${CMAKE_CURRENT_SOURCE_DIR}/cm_kernels" -cm + DEPENDS ${CM_KERNELS} "${CODEGEN_SCRIPT}" "${CODEGEN_INCDIR}/${PRIM_DB}" + COMMENT "Generating ${CODEGEN_CACHE_DIR}/${CM_PRIM_DB} ..." +) + +add_custom_command(OUTPUT "${CODEGEN_INCDIR}/${CM_PRIM_DB}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB}" "${CODEGEN_INCDIR}/${CM_PRIM_DB}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB_BATCH_HEADERS}" "${CODEGEN_INCDIR}/${CM_PRIM_DB_BATCH_HEADERS}" + DEPENDS "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB}" "${CM_KERNELS}" "${CODEGEN_SCRIPT}" "${CODEGEN_INCDIR}/${PRIM_DB}" + COMMENT "Updating file if the file changed (${CM_PRIM_DB}) ..." +) + add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${CODEGEN_CACHE_SOURCES}) if(NOT BUILD_SHARED_LIBS) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl index 542fa69ebc241b..109fa2de9841aa 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl @@ -122,8 +122,8 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( { #endif // SLM_DIV_FACTOR > 1 vec_t src = 0; -#if INPUT_LEFTOVERS - if ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM) + + if (INPUT_LEFTOVERS && ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)) { if (k * FEATURE_SLICE_SIZE + sglid < INPUT0_FEATURE_NUM) { @@ -143,7 +143,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( } } else -#endif // INPUT_LEFTOVERS { #if PADDED_INPUT #if X_BLOCK_SIZE > 1 diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 01c8e8853e350d..6a5c9e54a8e904 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -601,8 +601,10 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif #if TILE_OFM > 1 ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; #else acc[bi] += acc_tmp[bi] * ds; + acc_tmp[bi] = 0; #endif } } @@ -972,7 +974,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // 
===================================================================================================================================== // Main computation loop const uint iterations = MAIN_LOOP_ELEMENTS_COUNT / TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD) - // Each sub-group loads 2 Batch + // Each sub-group loads 2 Batch uint idx_sglid = (sglid * TILE_K) % TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction uint batch_sglid = (sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl index ca5c1ea3646d02..3f5796a30933ac 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ -147,9 +147,7 @@ inline void (FUNC_NAME)( // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, // but significantly degrades readability and generality of code. // It doesn't also show noticable performance improvement on tested configurations. - #if DECOMPRESSION_SCALE_POST_OP - ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; - #endif + ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) { #if COMPRESSED_WEIGHTS_INT4 @@ -201,11 +199,7 @@ inline void (FUNC_NAME)( unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { -#if DECOMPRESSION_SCALE_POST_OP ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; -#else - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; -#endif } } } @@ -240,9 +234,20 @@ inline void (FUNC_NAME)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; } } #endif + +#if !DECOMPRESSION_SCALE_POST_OP + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi]; + } + } +#endif + + } // ===================================================================================================================================== // Leftovers diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl index 00c43829d02ea7..7e960afa4b87d3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl @@ -44,6 +44,10 @@ KERNEL(pa_sdpa_opt)( const __global ALIBI_INPUT_TYPE* alibi_slopes, #endif __global OUTPUT_TYPE* output, +#if PAGED_ATTENTION_SCORES_OUTPUT + __global SOFTMAX_ACCUMULATOR_TYPE* softmax_results, + const __global int* subsequence_offsets, +#endif __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, __global OUTPUT_TYPE* tmp_out @@ -276,6 +280,28 @@ KERNEL(pa_sdpa_opt)( const uint max_logits_offset = exp_sums_offset; max_logits[max_logits_offset] = qk_max; } + +#if 
PAGED_ATTENTION_SCORES_OUTPUT +#if MULTI_TOKENS_PROCESSING + const uint subsequence_idx = gws_subseq_mapping[seq_idx]; + const uint subsequence_start_pos = subsequence_begins[subsequence_idx]; + const uint subsequence_end_pos = subsequence_begins[subsequence_idx + 1]; + const bool save_softmax_results = seq_idx == subsequence_end_pos - 1; +#else + const uint subsequence_idx = seq_idx; + const bool save_softmax_results = true; +#endif // MULTI_TOKENS_PROCESSING + // PagedAttention is supposed to save only last "row" of the QK matrix multiplication, + // so save SEQ_LEN_PARTITION_SIZE elements for each partition + if (save_softmax_results) { + const uint output_offset = subsequence_idx * HEADS_NUM * total_partitions_num * SEQ_LEN_PARTITION_SIZE + + head_num_idx * total_partitions_num * SEQ_LEN_PARTITION_SIZE + + partition_idx * SEQ_LEN_PARTITION_SIZE; + for (uint i = sgid * SUBGROUP_SIZE + sglid; i < SEQ_LEN_PARTITION_SIZE; i += SUBGROUPS_PER_WG * SUBGROUP_SIZE) { + softmax_results[output_offset + i] = slm_qk_vals[i]; + } + } +#endif // PAGED_ATTENTION_SCORES_OUTPUT } } @@ -370,6 +396,10 @@ KERNEL(pa_sdpa_finalization_stage)( const __global INPUT6_TYPE* subsequence_begins, #endif __global OUTPUT_TYPE* output, +#if PAGED_ATTENTION_SCORES_OUTPUT + __global SOFTMAX_ACCUMULATOR_TYPE* softmax_results, + const __global int* subsequence_offsets, +#endif const __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, const __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, const __global OUTPUT_TYPE* tmp_out, @@ -500,3 +530,155 @@ KERNEL(pa_sdpa_finalization_stage)( } #endif + +#ifdef SDPA_STAGE_2 +#define MAX_PARTITIONS_NUM 128 + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(pa_sdpa_scores_calculation)( + const __global INPUT3_TYPE* past_lens, + const __global INPUT6_TYPE* subsequence_begins, + __global OUTPUT1_TYPE* scores_output, + const __global SOFTMAX_ACCUMULATOR_TYPE* softmax_output, + const __global int* subsequence_offsets, + const __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + const __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + const __global OUTPUT_TYPE* tmp_out, + const uint is_mixed_mode) { + const uint subsequence_idx = get_global_id(2); + const uint partition_global_idx = get_global_id(0); + const uint local_id = get_local_id(0); + const uint partition_idx = get_group_id(0); + const uint partition_size = get_local_size(0); + const uint max_seq_len = get_global_size(0); + const uint partitions_num = get_num_groups(0); + const uint sgid = get_sub_group_id(); + const uint sgid_num = get_num_sub_groups(); + const uint sglid = get_sub_group_local_id(); + + const int subsequence_begin = subsequence_begins[subsequence_idx]; + const int subsequence_end = subsequence_begins[subsequence_idx + 1]; + const uint seq_len = (subsequence_end - subsequence_begin) + past_lens[subsequence_idx]; + + const uint num_of_partitions = CEIL_DIV(seq_len, partition_size); + + if (partition_idx >= num_of_partitions) + return; + + __local SOFTMAX_ACCUMULATOR_TYPE slm_exp_sums[HEADS_NUM]; + __local SOFTMAX_ACCUMULATOR_TYPE slm_global_exp_sum[HEADS_NUM]; + + SOFTMAX_ACCUMULATOR_TYPE total_score = SOFTMAX_ACCUMULATOR_VAL_ZERO; + if (seq_len <= partition_size) { + // If seq_len is less than the partition size, just reduce the results over the heads + for (uint head_idx = 0; head_idx < HEADS_NUM; head_idx++) { + const uint input_offset = subsequence_idx * HEADS_NUM * max_seq_len + head_idx * max_seq_len + partition_global_idx; + SOFTMAX_ACCUMULATOR_TYPE softmax_value = softmax_output[input_offset]; + total_score += softmax_value; + } 
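        // (Editorial note, hedged) In this single-partition branch the local
        // softmax already equals the global one, so the token's score is simply
        // the probability summed over heads:
        //
        //   score[t] = sum_h softmax_h[t]
        //
        // The longer-sequence branches below must first rescale each partition's
        // local results into a common scale, mirroring the two-pass softmax used
        // by the main SDPA kernels:
        //
        //   adj_exp_sum_p = exp_sum_p * exp(max_logit_p - global_max_logit)
        //   score[t]      = sum_h softmax_h[t] * adj_exp_sum_p / global_exp_sum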
+ } else if (seq_len <= partition_size * MAX_PARTITIONS_NUM) { + // Optimized version for longer prompts (up to partition_size * MAX_PARTITIONS_NUM, ~64K tokens) + + // Depending on the previous kernel exp_sums and max_logits might have different structure: + // For ordinary 1st and 2nd token kernels, there is only a single entry per subsequence. + // However, for mixed mode execution, exp_sums and max_logits include information for all + // tokens of each subsequence, but only the last one is needed for score calculation. + const uint subsequence_pos = is_mixed_mode ? subsequence_end - 1 : subsequence_idx; + + for (uint head_idx = sgid; head_idx < HEADS_NUM; head_idx += sgid_num) { + SOFTMAX_ACCUMULATOR_TYPE max_logit[MAX_PARTITIONS_NUM / SUBGROUP_SIZE]; + SOFTMAX_ACCUMULATOR_TYPE exp_sum[MAX_PARTITIONS_NUM / SUBGROUP_SIZE]; + + const uint exp_sums_offset = subsequence_pos * HEADS_NUM * partitions_num + head_idx * partitions_num; + for (int i = 0; i < partitions_num / SUBGROUP_SIZE; i++) { + max_logit[i] = max_logits[exp_sums_offset + i * SUBGROUP_SIZE + sglid]; + exp_sum[i] = exp_sums[exp_sums_offset + i * SUBGROUP_SIZE + sglid]; + } + + const uint partitions_leftovers = partitions_num % SUBGROUP_SIZE; + if (partitions_leftovers != 0) { + const uint idx = partitions_num / SUBGROUP_SIZE; + max_logit[idx] = sglid >= partitions_leftovers ? SOFTMAX_ACCUMULATOR_VAL_MIN : max_logits[exp_sums_offset + idx * SUBGROUP_SIZE + sglid]; + exp_sum[idx] = sglid >= partitions_leftovers ? SOFTMAX_ACCUMULATOR_VAL_ZERO : exp_sums[exp_sums_offset + idx * SUBGROUP_SIZE + sglid]; + } + + SOFTMAX_ACCUMULATOR_TYPE global_max_logit = max_logit[0]; + for (uint i = 1; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + global_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(global_max_logit, max_logit[i]); + } + + global_max_logit = sub_group_reduce_max(global_max_logit); + + SOFTMAX_ACCUMULATOR_TYPE global_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + for (uint i = 0; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + SOFTMAX_ACCUMULATOR_TYPE adjusted_exp_sum = exp_sum[i] * native_exp(max_logit[i] - global_max_logit); + // slm_exp_sums[head_idx][i * SUBGROUP_SIZE + sglid] = adjusted_exp_sum; + if (i * SUBGROUP_SIZE + sglid == partition_idx) + slm_exp_sums[head_idx] = adjusted_exp_sum; + global_exp_sum += adjusted_exp_sum; + } + + global_exp_sum = sub_group_reduce_add(global_exp_sum); + + slm_global_exp_sum[head_idx] = global_exp_sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint head_idx = 0; head_idx < HEADS_NUM; head_idx++) { + SOFTMAX_ACCUMULATOR_TYPE adjusted_exp_sum = slm_exp_sums[head_idx]; + SOFTMAX_ACCUMULATOR_TYPE global_exp_sum = slm_global_exp_sum[head_idx]; + + const uint input_offset = subsequence_idx * HEADS_NUM * max_seq_len + head_idx * max_seq_len + partition_global_idx; + SOFTMAX_ACCUMULATOR_TYPE softmax_value = softmax_output[input_offset]; + + softmax_value = softmax_value * adjusted_exp_sum / global_exp_sum; + total_score += softmax_value; + } + } else { + // Non optimized fallback version + const uint subsequence_pos = is_mixed_mode ? 
subsequence_end - 1 : subsequence_idx; + for (uint head_idx = 0; head_idx < HEADS_NUM; head_idx++) { + SOFTMAX_ACCUMULATOR_TYPE global_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + const uint max_logits_base_offset = subsequence_pos * HEADS_NUM * partitions_num + head_idx * partitions_num; + for (uint i = 0; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + const uint partition_offset = i * SUBGROUP_SIZE + sglid; + SOFTMAX_ACCUMULATOR_TYPE max_logit = partition_offset >= partitions_num ? SOFTMAX_ACCUMULATOR_VAL_MIN : max_logits[max_logits_base_offset + partition_offset]; + global_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(global_max_logit, max_logit); + } + + global_max_logit = sub_group_reduce_max(global_max_logit); + + SOFTMAX_ACCUMULATOR_TYPE global_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE partition_adjusted_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + const uint exp_sums_base_offset = subsequence_pos * HEADS_NUM * partitions_num + head_idx * partitions_num; + for (uint i = 0; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + const uint partition_offset = i * SUBGROUP_SIZE + sglid; + SOFTMAX_ACCUMULATOR_TYPE exp_sum = partition_offset >= partitions_num ? SOFTMAX_ACCUMULATOR_VAL_ZERO : exp_sums[exp_sums_base_offset + partition_offset]; + SOFTMAX_ACCUMULATOR_TYPE max_logit = partition_offset >= partitions_num ? SOFTMAX_ACCUMULATOR_VAL_MIN : max_logits[max_logits_base_offset + partition_offset]; + SOFTMAX_ACCUMULATOR_TYPE adjusted_exp_sum = exp_sum * native_exp(max_logit - global_max_logit); + global_exp_sum += adjusted_exp_sum; + + // Save and broadcast the adjusted exp_sum for the currently being processed partition + if (i == partition_idx / SUBGROUP_SIZE) + partition_adjusted_exp_sum = sub_group_broadcast(adjusted_exp_sum, partition_idx % SUBGROUP_SIZE); + } + + global_exp_sum = sub_group_reduce_add(global_exp_sum); + + const uint input_offset = subsequence_idx * HEADS_NUM * max_seq_len + head_idx * max_seq_len + partition_global_idx; + SOFTMAX_ACCUMULATOR_TYPE softmax_value = softmax_output[input_offset]; + + softmax_value = softmax_value * partition_adjusted_exp_sum / global_exp_sum; + total_score += softmax_value; + } + } + + const uint output_offset = subsequence_offsets[subsequence_idx]; + if (partition_global_idx < seq_len) { + scores_output[output_offset + partition_global_idx] = total_score; + } +} + +#undef MAX_PARTITIONS_NUM +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl index 95f0d0ff399a3b..ee27d220e30ce9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl @@ -66,10 +66,7 @@ KERNEL (reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx)( #if (TILE_SIZE == DEFAULT_TILE_SIZE) - // read - INPUTVTYPE read_data = AS_INPUTVTYPE(_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); - - // write + // write index const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER); if (F_NO_REMAINDER_CONDITION @@ -79,13 +76,25 @@ KERNEL (reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx)( ) { #ifdef X_REMAINDER_SIZE if (X_REMAINDER_CONDITION) { + // read + INPUTVTYPE read_data; + for (int j = 0; j < X_REMAINDER_SIZE; ++j) { + read_data[j] = AS_INPUT0_TYPE(_sub_group_block_read((const __global uint*)(input) + input_idx_tile + j * 
DEFAULT_STRIDE)); + } + // write for (int i = 0 ; i < X_REMAINDER_SIZE; i++) { output[output_idx + i] = TO_OUTPUT_TYPE(read_data[i]); } } else { + // read + INPUTVTYPE read_data = AS_INPUTVTYPE(_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); + // write VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); } #else + // read + INPUTVTYPE read_data = AS_INPUTVTYPE(_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); + // write VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl index 45d0ccc5c0933e..2f403b798dea39 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl @@ -26,6 +26,18 @@ } \ } +#define FUNC_LOAD_LEFTOVERS(inner, outer) unroll_for (uint lh = 0; lh < outer; ++lh) { \ + const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER); \ + INPUTVTYPE read_data; \ + unroll_for (uint lw = 0; lw < inner; ++lw) { \ + read_data[lw] = input[input_idx + lw]; \ + } \ + unroll_for (uint lw = 0; lw < inner; ++lw) { \ + const uint dst = local_buf_offset + lw; \ + transpose_buf[dst][lh] = read_data[lw]; \ + } \ + } + #define FUNC_VSTORE(loop) unroll_for (uint lw = 0; lw < loop; ++lw) { \ const uint output_idx = output_idx_tile + (lw * x_pitch); \ VSTORE(TO_OUTPUTVTYPE(transpose_buf[local_buf_offset + lw]), 0, output + output_idx); \ @@ -109,7 +121,15 @@ KERNEL (reorder_data_bfyx_to_blocked_format)( if (F_NO_REMAINDER_CONDITION) { // read and transpose +#ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_VLOAD(TILE_SIZE, TILE_SIZE) + } else { + FUNC_LOAD_LEFTOVERS(X_REMAINDER_SIZE, TILE_SIZE) + } +#else FUNC_VLOAD(TILE_SIZE, TILE_SIZE) +#endif // write to ddr #ifdef X_REMAINDER_CONDITION @@ -125,7 +145,15 @@ KERNEL (reorder_data_bfyx_to_blocked_format)( #ifdef F_REMAINDER_CONDITION else if (F_REMAINDER_CONDITION) { // read and transpose + #ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_VLOAD(TILE_SIZE, F_REMAINDER_SIZE) + } else { + FUNC_LOAD_LEFTOVERS(X_REMAINDER_SIZE, F_REMAINDER_SIZE) + } + #else FUNC_VLOAD(TILE_SIZE, F_REMAINDER_SIZE) + #endif // write to ddr #ifdef X_REMAINDER_CONDITION diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 55f87e4189d9fe..cddafe62623d9e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -842,6 +842,14 @@ KERNEL(sdpa_opt)( const __global int* blocked_indexes_start, const __global int* blocked_indexes_end, const __global int* gws_seq_indexes_correspondence +#if PAGED_ATTENTION_SCORES_OUTPUT + , __global SOFTMAX_ACCUMULATOR_TYPE* softmax_results + , const __global int* subsequence_offsets + , __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums + , __global SOFTMAX_ACCUMULATOR_TYPE* max_logits + , __global OUTPUT_TYPE* tmp_out + , const uint aligned_max_context_len +#endif #else __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, @@ -1222,6 +1230,39 @@ KERNEL(sdpa_opt)( slm_qk_vals[sglid * SEQ_LEN_PARTITION_SIZE + sgid * TARGET_SEQ_LEN_BLOCK_SIZE + i] = qk_acc[i]; } +#if 
PAGED_ATTENTION_SCORES_OUTPUT + const uint subsequence_idx = gws_seq_indexes_correspondence[target_seq_dim]; + const uint subsequence_end_pos = subsequence_begins[subsequence_idx + 1]; + const uint block_start_pos = blocked_indexes_start[target_seq_dim]; + const uint block_end_pos = blocked_indexes_end[target_seq_dim]; + + // PagedAttention is supposed to save only last "row" of the QK matrix multiplication, + // so save SEQ_LEN_PARTITION_SIZE elements for each partition + if (subsequence_end_pos == block_end_pos) { + const uint last_row_idx = block_end_pos - block_start_pos - 1; + if (sglid == last_row_idx) { + const uint partition_idx = start_partition_idx / SEQ_LEN_PARTITION_SIZE; + + if (sgid == 0) { + const uint max_partitions_num = aligned_max_context_len / SEQ_LEN_PARTITION_SIZE; + const uint exp_sums_output_offset = subsequence_idx * NUM_HEADS * max_partitions_num + + num_heads_dim * max_partitions_num + + partition_idx; + exp_sums[exp_sums_output_offset] = exp_sum_new; + max_logits[exp_sums_output_offset] = qk_max_new; + } + + const uint output_offset = subsequence_idx * NUM_HEADS * aligned_max_context_len + + num_heads_dim * aligned_max_context_len + + partition_idx * SEQ_LEN_PARTITION_SIZE + sgid * TARGET_SEQ_LEN_BLOCK_SIZE; + for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + softmax_results[output_offset + i] = qk_acc[i]; + } + + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/example.cpp b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/example.cpp new file mode 100644 index 00000000000000..abee70f6483d17 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/example.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +namespace KERNEL_NAME { + +#include "include/example_header.h" + +extern "C" _GENX_MAIN_ void KERNEL_NAME(svmptr_t x [[type("svmptr_t")]]) { + // This kernel prints and exits + if (cm_linear_global_id() == 0) { + printf("Example CM kernel\n"); + printf("Pointer address: %p\n", (void*)x); + + // Call function from header + print_lws_gws(); + + // Check macro from batch header +#ifdef EXAMPLE_CM_MACRO + printf("Batch header included\n"); +#else + printf("Batch header not included\n"); +#endif + } +} +} // namespace KERNEL_NAME diff --git a/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/batch_headers/exmaple_batch_header.h b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/batch_headers/exmaple_batch_header.h new file mode 100644 index 00000000000000..f3f2aa183e88dc --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/batch_headers/exmaple_batch_header.h @@ -0,0 +1,5 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#define EXAMPLE_CM_MACRO diff --git a/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/example_header.h b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/example_header.h new file mode 100644 index 00000000000000..3ce3a33188d0fc --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/example_header.h @@ -0,0 +1,8 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +inline void print_lws_gws() { + printf("lws: %d, %d, %d\n", cm_local_size(0), cm_local_size(1), cm_local_size(2)); + printf("gws: %d, %d, %d\n", cm_group_count(0), cm_group_count(1), cm_group_count(2)); +} diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_cm.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_cm.h new file mode 100644 index 00000000000000..32744f65bee7e0 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_cm.h @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base.h" + +namespace kernel_selector { + +class KernelBaseCM : public KernelBase { +public: + using KernelBase::KernelBase; + virtual ~KernelBaseCM() {} + +protected: + virtual bool Validate(const Params&) const { + return true; + } + std::shared_ptr GetKernelString(const std::string& kernel_name, + const std::pair& jit, + const std::string& entry_point) const { + std::shared_ptr kernel_string = std::make_shared(); + + bool is_cm = true; + auto codes = db.get(kernel_name, is_cm); + + if (codes.size()) { + kernel_string->str = codes[0]; + kernel_string->jit = "#include \n#include \n"; + kernel_string->jit += jit.first; + kernel_string->undefs = jit.second; + kernel_string->options = " -cmc "; + + kernel_string->entry_point = entry_point; + kernel_string->batch_compilation = true; + kernel_string->language = KernelLanguage::CM; + } + + return kernel_string; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index d9b132ac1dcc43..b55740110b2f28 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -46,6 +46,7 @@ namespace kernel_selector { std::string GetStringEnv(const char* varName); +using KernelLanguage = cldnn::kernel_language; using KernelString = cldnn::kernel_string; using WorkGroupSizes = cldnn::work_group_sizes; using ScalarDescriptor = cldnn::scalar_desc; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 6fd074f8d8506d..7150d51ecf1e48 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -264,6 +264,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut } if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) { jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1)); + } else { + jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 0)); } } else { DimensionAccessHelperJit input0_dims(params.inputs[0]); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.cpp new file mode 100644 index 00000000000000..32719501d937d2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fully_connected_cm_example.h" + +namespace kernel_selector { +KernelsData FullyConnected_cm_example::GetKernelsData(const Params& params) const { + if (!Validate(params)) { + return {}; + } + auto options = std::string(" -Qxcm_jit_option=-DPASTokenReduction "); + 
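    // (Editorial note, hedged) The rest of this function shows the minimal
    // KernelData wiring for a CM kernel: a single INPUT argument, hard-coded
    // local/global work-group sizes, and a JIT define that binds the generic
    // entry point in cm_kernels/example.cpp to a unique KERNEL_NAME. The
    // "-Qxcm_jit_option" switch above is forwarded verbatim to the CM JIT
    // compiler; treat that particular option as illustrative for this tutorial
    // kernel rather than required.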
+ KernelData kd = KernelData::Default(params, 1); + auto& kernel = kd.kernels[0]; + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.workGroups.local = {1, 2, 4}; + kernel.params.workGroups.global = {1, 4, 8}; + + std::string kernel_name = "fully_connected_cm_example"; + auto jit = std::pair("\n#define KERNEL_NAME " + kernel_name, "#undef KERNEL_NAME"); + kernel.code.kernelString = GetKernelString("example", jit, kernel_name); + kernel.code.kernelString->options += options; + kernel.code.kernelString->batch_compilation = true; + return {kd}; +} +KernelsPriority FullyConnected_cm_example::GetKernelsPriority(const Params& params) const { + return TUTORIAL_PRIORITY; +} +ParamsKey FullyConnected_cm_example::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + k.EnableInputWeightsType(WeightsType::F16); + k.EnableInputWeightsType(WeightsType::UINT8); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableDifferentInputWeightsTypes(); + k.EnableDifferentTypes(); + k.EnableBiasPerOutput(); + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableQuantization(QuantizationType::SYMMETRIC); + k.EnableWeightsCompression(); + return k; +} +bool FullyConnected_cm_example::Validate(const Params& p) const { + return true; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.h new file mode 100644 index 00000000000000..844f3395bd8430 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.h @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "../fully_connected_params.h" +#include "kernel_base_cm.h" + +namespace kernel_selector { +class FullyConnected_cm_example : public KernelBaseCM { +public: + FullyConnected_cm_example() : KernelBaseCM("fully_connected_example") {} + virtual ~FullyConnected_cm_example() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.cpp new file mode 100644 index 00000000000000..dfc6d4342b1490 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.cpp @@ -0,0 +1,17 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fully_connected_cm_kernel_selector.h" + +#include "fully_connected_cm_example.h" + +namespace kernel_selector { +fully_connected_cm_kernel_selector::fully_connected_cm_kernel_selector() { + Attach(); +} + +KernelsData fully_connected_cm_kernel_selector::GetBestKernels(const Params& params) const { + return GetAutoTuneBestKernel(params, KernelType::FULLY_CONNECTED); +} +} // namespace kernel_selector diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.h new file mode 100644 index 00000000000000..937d605f9ebad2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.h @@ -0,0 +1,24 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { + +class fully_connected_cm_kernel_selector : public kernel_selector_base { +public: + static fully_connected_cm_kernel_selector& Instance() { + static fully_connected_cm_kernel_selector instance_; + return instance_; + } + + fully_connected_cm_kernel_selector(); + + virtual ~fully_connected_cm_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp index ddfb491f50278a..ce20f49de597ff 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp @@ -167,7 +167,7 @@ void KVCacheUpdateKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { const auto indexes_dt = Datatype::INT32; const auto target_seq_len_block_size = 16; - const auto target_seq_len = prim_params.conf.paged_attention_aligned_seq_len; + const auto target_seq_len = std::max(prim_params.conf.paged_attention_aligned_seq_len, static_cast(1)); const auto indexes_buf_size = CeilDiv(target_seq_len, target_seq_len_block_size) * BytesPerElement(indexes_dt); kd.internalBufferSizes.clear(); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp index 63c5e74160f652..909a40d677f535 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "sdpa_kernel_opt.h" #include "pa_sdpa_kernel_opt.h" #include "kernel_selector_params.h" @@ -15,6 +16,7 @@ enum KernelsTypes { MULTI_TOKENS, FINALIZATION, FINALIZATION_MULTI_TOKENS, + SCORES_CALCULATION, TOTAL_KERNELS_NUM }; @@ -35,6 +37,8 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type) { kernel_name += "_finalization"; } else if (type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { kernel_name += "_finalization_multi_tokens_seq"; + } else if (type == KernelsTypes::SCORES_CALCULATION) { + kernel_name += "_scores_calculation"; } return kernel_name; @@ -46,10 +50,15 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { } const auto& params = static_cast(p); - const std::vector kernels_type = { KernelsTypes::SINGLE_TOKEN, - KernelsTypes::MULTI_TOKENS, - KernelsTypes::FINALIZATION, - KernelsTypes::FINALIZATION_MULTI_TOKENS }; + std::vector kernels_type = { KernelsTypes::SINGLE_TOKEN, + KernelsTypes::MULTI_TOKENS, + KernelsTypes::FINALIZATION, + KernelsTypes::FINALIZATION_MULTI_TOKENS }; + + const auto has_scores_output = params.outputs.size() > 1; + if (has_scores_output) { + 
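        // (Editorial note, hedged) A second output on the primitive is the
        // marker for the optional scores output: when it is present, a fifth
        // kernel (SCORES_CALCULATION) is appended after the two main and two
        // finalization kernels, so kd.kernels.size() becomes TOTAL_KERNELS_NUM
        // instead of TOTAL_KERNELS_NUM - 1; update_dispatch_data_func below
        // performs the same outputs.size() check to validate this.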
kernels_type.push_back(KernelsTypes::SCORES_CALCULATION); + } KernelData kd = KernelData::Default(params, kernels_type.size()); kd.needs_sub_kernels_sync = true; @@ -65,7 +74,8 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { const auto jit = CreateJit(kernel_name, jit_constants, entry_point); - size_t inputs_num = static_cast(params.inputs.size()); + int inputs_num = static_cast(params.inputs.size()); + int outputs_num = 1; if (kernel_type == KernelsTypes::SINGLE_TOKEN) { // SINGLE_TOKEN kernel doesn't use the subsequence_begins input inputs_num -= 1; @@ -75,6 +85,11 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { } else if (kernel_type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { // FINALIZATION_MULTI_TOKENS kernel uses past_lens data input and subsequence_begins inputs_num = 2; + } else if (kernel_type == KernelsTypes::SCORES_CALCULATION) { + // SCORES_CALCULATION kernel uses past_lens data input and subsequence_begins + inputs_num = 2; + // Output is configured manually to use the second output memory buffer + outputs_num = 0; } auto& kernel = kd.kernels[kd_kernels_idx++]; @@ -87,19 +102,33 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { {}, false, false, - static_cast(inputs_num), + inputs_num, GetFusedPrimitiveInputsCount(params), - static_cast(params.outputs.size()), + outputs_num, params.is_shape_agnostic); - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); + if (kernel_type == KernelsTypes::SCORES_CALCULATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 1}); + } + + uint32_t internal_buffers_num = 0; + if (has_scores_output) { + // Intermediate softmax results for scores output calculation and precalculated accumulated + // sequence length offsets for each subsequence + internal_buffers_num += 2; + } + + // Softmax's exp_sums, max_logits and intermediate output + internal_buffers_num += 3; if (kernel_type == KernelsTypes::MULTI_TOKENS || kernel_type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { // MULTIPLE_TOKENS kernels needs additional information related to mapping // launched kernel instances to subsequence indexes - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3}); + internal_buffers_num++; + } + + for (uint32_t i = 0; i < internal_buffers_num; i++) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, i}); } if (kernel_type == KernelsTypes::FINALIZATION || kernel_type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { @@ -108,6 +137,15 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { // Remove unused shape_info argument at finalization stage kernel.params.arguments.erase(kernel.params.arguments.begin()); } + + if (kernel_type == KernelsTypes::SCORES_CALCULATION) { + // The scores kernel needs to know if the current execution mode is mixed or ordinary + // to configure proper memory access + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + // Remove unused shape_info argument for scores kernel + kernel.params.arguments.erase(kernel.params.arguments.begin()); + } } return {kd}; @@ -173,7 +211,12 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& 
jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", config.group_size)); } - auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS ? 1 : 0; + auto sdpa_stage = 0; + if (kernel_idx == KernelsTypes::FINALIZATION || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) { + sdpa_stage = 1; + } else if (kernel_idx == KernelsTypes::SCORES_CALCULATION) { + sdpa_stage = 2; + } jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1)); if (config.has_const_scale_val) { @@ -190,6 +233,10 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& jit.Merge(MakeTypeJitConstants(params.inputs[alibi_input_idx].GetDType(), "ALIBI_INPUT")); } + if (params.outputs.size() > 1) { + jit.AddConstant(MakeJitConstant("PAGED_ATTENTION_SCORES_OUTPUT", 1)); + } + if (kernel_idx == KernelsTypes::MULTI_TOKENS || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) jit.AddConstant(MakeJitConstant("MULTI_TOKENS_PROCESSING", 1)); @@ -203,18 +250,36 @@ CommonDispatchData PagedAttentionSDPAKernelOpt::SetDefault(const pa_sdpa_params& const auto& input = params.inputs[0]; if (!input.is_dynamic()) { - const size_t sequences_number = input.Batch().v; - const size_t num_of_partitions = CeilDiv(params.max_context_len, seq_len_partition_size); + const size_t total_tokens = input.Batch().v; + const size_t num_of_partitions = CeilDiv(params.conf.paged_attention_max_len, seq_len_partition_size); const size_t heads_num = static_cast(params.conf.heads_num); const size_t head_size = static_cast(params.conf.head_size); - if (kernel_idx == 0) { - dispatch_data.gws = { sequences_number, + if (kernel_idx == KernelsTypes::SINGLE_TOKEN || kernel_idx == KernelsTypes::MULTI_TOKENS) { + dispatch_data.gws = { total_tokens, heads_num, head_size * num_of_partitions }; dispatch_data.lws = { 1, 1, head_size }; + } else if (kernel_idx == KernelsTypes::SCORES_CALCULATION) { + const auto& past_lens = params.inputs[3]; + const auto subsequences_number = past_lens.Batch().v; + + size_t partition_size = 0; + size_t num_of_partitions = 0; + if (params.stage == PagedAttentionStage::PREFILL) { + partition_size = SDPAKernelOpt::get_seq_len_partition_size(params, params.conf.head_size, 1); + } else { + partition_size = seq_len_partition_size; + } + + num_of_partitions = CeilDiv(params.conf.paged_attention_max_len, partition_size); + + dispatch_data.gws = { partition_size * num_of_partitions, + 1, + subsequences_number }; + dispatch_data.lws = { partition_size, 1, 1 }; } else { - dispatch_data.gws = { sequences_number, + dispatch_data.gws = { total_tokens, heads_num, head_size }; dispatch_data.lws = { 1, 1, subgroup_size }; @@ -228,30 +293,39 @@ void PagedAttentionSDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) cons kd.update_dispatch_data_func = [](const Params& params, KernelData& kd) { const auto& prim_params = static_cast(params); - const size_t expected_kernels_num = 4; - OPENVINO_ASSERT(kd.kernels.size() == expected_kernels_num, "[GPU] Invalid kernels size for update dispatch data func of SDPA kernel"); + const auto has_scores_output = prim_params.outputs.size() > 1; + const auto expected_kernels_num = has_scores_output ? 
KernelsTypes::TOTAL_KERNELS_NUM : KernelsTypes::TOTAL_KERNELS_NUM - 1; + OPENVINO_ASSERT(kd.kernels.size() == static_cast(expected_kernels_num), + "[GPU] Invalid kernels size for update dispatch data func of SDPA kernel"); + + const auto scores_calc_only = prim_params.stage == PagedAttentionStage::PREFILL && has_scores_output; + const auto multi_tokens_mode = prim_params.stage == PagedAttentionStage::MIXED; auto dispatch_data1 = SetDefault(prim_params, KernelsTypes::SINGLE_TOKEN); kd.kernels[KernelsTypes::SINGLE_TOKEN].params.workGroups.global = dispatch_data1.gws; kd.kernels[KernelsTypes::SINGLE_TOKEN].params.workGroups.local = dispatch_data1.lws; - kd.kernels[KernelsTypes::SINGLE_TOKEN].skip_execution = prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::SINGLE_TOKEN].skip_execution = multi_tokens_mode || scores_calc_only; kd.kernels[KernelsTypes::MULTI_TOKENS].params.workGroups.global = dispatch_data1.gws; kd.kernels[KernelsTypes::MULTI_TOKENS].params.workGroups.local = dispatch_data1.lws; - kd.kernels[KernelsTypes::MULTI_TOKENS].skip_execution = !prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::MULTI_TOKENS].skip_execution = !multi_tokens_mode || scores_calc_only; - const auto& input = prim_params.inputs[0]; - const size_t sequences_number = input.Batch().v; - const size_t num_of_partitions = CeilDiv(prim_params.max_context_len, seq_len_partition_size); + size_t partition_size = 0; + if (prim_params.stage == PagedAttentionStage::PREFILL) { + partition_size = SDPAKernelOpt::get_seq_len_partition_size(params, prim_params.conf.head_size, 1); + } else { + partition_size = seq_len_partition_size; + } + const size_t num_of_partitions = CeilDiv(prim_params.conf.paged_attention_max_len, partition_size); auto dispatch_data2 = SetDefault(prim_params, KernelsTypes::FINALIZATION); kd.kernels[KernelsTypes::FINALIZATION].params.workGroups.global = dispatch_data2.gws; kd.kernels[KernelsTypes::FINALIZATION].params.workGroups.local = dispatch_data2.lws; - kd.kernels[KernelsTypes::FINALIZATION].skip_execution = num_of_partitions == 1 || prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::FINALIZATION].skip_execution = num_of_partitions == 1 || multi_tokens_mode || scores_calc_only; kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.workGroups.global = dispatch_data2.gws; kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.workGroups.local = dispatch_data2.lws; - kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].skip_execution = num_of_partitions == 1 || !prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].skip_execution = num_of_partitions == 1 || !multi_tokens_mode || scores_calc_only; ScalarDescriptor num_of_partitions_scalar; num_of_partitions_scalar.t = ScalarDescriptor::Types::UINT32; @@ -261,23 +335,63 @@ void PagedAttentionSDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) cons kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.scalars.resize(1); kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.scalars[0] = num_of_partitions_scalar; + if (has_scores_output) { + auto dispatch_data = SetDefault(prim_params, KernelsTypes::SCORES_CALCULATION); + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.workGroups.global = dispatch_data.gws; + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.workGroups.local = dispatch_data.lws; + kd.kernels[KernelsTypes::SCORES_CALCULATION].skip_execution = false; + + ScalarDescriptor is_mixed_mode; + is_mixed_mode.t = ScalarDescriptor::Types::UINT32; + is_mixed_mode.v.u32 = 
static_cast(multi_tokens_mode); + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.scalars.resize(1); + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.scalars[0] = is_mixed_mode; + } + + const auto& input = prim_params.inputs[0]; + const size_t total_tokens = input.Batch().v; + auto buf_dt_size = BytesPerElement(softmax_acc_dt); - auto buf_elements_count = sequences_number * prim_params.conf.heads_num * num_of_partitions; + auto buf_elements_count = total_tokens * prim_params.conf.heads_num * num_of_partitions; auto buf_size = buf_elements_count * buf_dt_size; auto tmp_out_dt_size = BytesPerElement(softmax_acc_dt); - auto tmp_out_elements_count = sequences_number * prim_params.conf.heads_num * prim_params.conf.head_size * num_of_partitions; + auto tmp_out_elements_count = total_tokens * prim_params.conf.heads_num * prim_params.conf.head_size * num_of_partitions; auto tmp_out_size = tmp_out_elements_count * tmp_out_dt_size; kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(buf_size); - kd.internalBufferSizes.push_back(buf_size); - kd.internalBufferSizes.push_back(tmp_out_size); + + if (has_scores_output) { + const auto& past_lens = prim_params.inputs[3]; + auto subsequences_number = past_lens.Batch().v; + auto softmax_buf_dt_size = BytesPerElement(softmax_acc_dt); + + auto softmax_buf_elements_count = subsequences_number * prim_params.conf.heads_num * num_of_partitions * partition_size; + auto softmax_buf_size = softmax_buf_elements_count * softmax_buf_dt_size; + + // Softmax intermediate output + kd.internalBufferSizes.push_back(softmax_buf_size); + // Precalculated accumulated sequence length offsets for each subsequence + kd.internalBufferSizes.push_back(subsequences_number * BytesPerElement(Datatype::INT32)); + + if (prim_params.stage == PagedAttentionStage::PREFILL) { + // Recalculate buf_size as in case of PREFILL stage it's not needed to allocate buffer per each input token + buf_elements_count = subsequences_number * prim_params.conf.heads_num * num_of_partitions; + buf_size = buf_elements_count * buf_dt_size; + + // Intermediate tmp output buffer is not used for PREFILL stage + tmp_out_size = tmp_out_dt_size; + } + } + + kd.internalBufferSizes.push_back(buf_size); // softmax exp_sums + kd.internalBufferSizes.push_back(buf_size); // softmax max_logits + kd.internalBufferSizes.push_back(tmp_out_size); // intermediate output kd.internalBufferDataType = softmax_acc_dt; - if (prim_params.multi_tokens_mode) { + if (multi_tokens_mode) { auto buf_dt_size = BytesPerElement(Datatype::INT32); - auto buf_elements_count = sequences_number; + auto buf_elements_count = total_tokens; auto buf_size = Align(buf_elements_count * buf_dt_size, BytesPerElement(softmax_acc_dt)); kd.internalBufferSizes.push_back(buf_size); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h index a2456ccd9e2af5..a52571b03691df 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h @@ -9,11 +9,17 @@ namespace kernel_selector { +enum PagedAttentionStage { + GENERATE = 0, + PREFILL = 1, + MIXED = 2, + UNKNOWN = 3 +}; + struct pa_sdpa_params : base_params { pa_sdpa_params() : base_params(KernelType::PA_SDPA) {} - bool multi_tokens_mode = false; - size_t max_context_len = 0; + PagedAttentionStage stage = PagedAttentionStage::UNKNOWN; sdpa_configuration conf; }; diff --git 
index 5cd9c384ff2709..8fcc4a16692d6c 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h
@@ -97,6 +97,7 @@ struct sdpa_configuration {
     bool is_paged_attention = false;
     int64_t paged_attention_aligned_seq_len = -1;
     int64_t paged_attention_block_size = 0;
+    int64_t paged_attention_max_len = 0;
 
     bool has_const_scale_val = false;
     float scale_val = 0.f;
 };
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp
index 4e71064efbc895..4c23d4de4fd68d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp
@@ -21,38 +21,11 @@ enum KernelsTypes {
 constexpr size_t subgroup_size = 16;
 }  // namespace
 
-static size_t get_sg_number_scale_factor(const sdpa_params& sdpa_params, size_t kernel_type) {
-    const size_t optimal_scale_factor = 2;
-    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
-        if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) {
-            return optimal_scale_factor;
-        }
-    } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) {
-        if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize &&
-            sdpa_params.conf.head_size * optimal_scale_factor / subgroup_size <= subgroup_size) {
-            return optimal_scale_factor;
-        }
-    }
-
-    return 1;
-}
-
 static size_t get_target_seq_len_block_size() {
     const size_t block_size = 16;
     return block_size;
 }
 
-static size_t get_seq_len_partition_size(const sdpa_params& sdpa_params, size_t kernel_type) {
-    size_t seq_len = 0;
-    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
-        seq_len = sdpa_params.conf.head_size * get_sg_number_scale_factor(sdpa_params, kernel_type);
-    } else {
-        seq_len = 256;
-    }
-
-    return seq_len;
-}
-
 static Datatype get_softmax_acc_type() {
     return Datatype::F32;
 }
@@ -71,7 +44,7 @@ static size_t get_partitions_num(const sdpa_params& sdpa_params, size_t kernel_t
     TransposedDimensionAccessHelperBase dims_k(sdpa_params.inputs[1], sdpa_params.input1_order);
     auto source_seq_len = dims_k.y_dim().v;
 
-    return CeilDiv(source_seq_len, get_seq_len_partition_size(sdpa_params, kernel_type));
+    return CeilDiv(source_seq_len, SDPAKernelOpt::get_seq_len_partition_size(sdpa_params, sdpa_params.conf.head_size, kernel_type));
 }
 
 static std::vector<size_t> get_internal_buffer_sizes(const sdpa_params& sdpa_params, size_t kernel_type) {
@@ -130,6 +103,33 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const
     return kernel_name;
 }
 
+size_t SDPAKernelOpt::get_sg_number_scale_factor(const Params& params, size_t head_size, size_t kernel_type) {
+    const size_t optimal_scale_factor = 2;
+    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
+        if (head_size * optimal_scale_factor <= params.engineInfo.maxWorkGroupSize) {
+            return optimal_scale_factor;
+        }
+    } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) {
+        if (head_size * optimal_scale_factor <= params.engineInfo.maxWorkGroupSize &&
+            head_size * optimal_scale_factor / subgroup_size <= subgroup_size) {
+            return optimal_scale_factor;
+        }
+    }
+
+    return 1;
+}
+
+size_t SDPAKernelOpt::get_seq_len_partition_size(const Params& params, size_t head_size, size_t kernel_type) {
+    size_t seq_len = 0;
+    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
+        seq_len = head_size * get_sg_number_scale_factor(params, head_size, kernel_type);
+    } else {
+        seq_len = 256;
+    }
+
+    return seq_len;
+}
+
 ParamsKey SDPAKernelOpt::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
@@ -176,14 +176,14 @@ JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t ke
     const auto& config = params.conf;
 
     jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size));
     jit.AddConstant(MakeJitConstant("HEAD_SIZE", config.head_size));
-    jit.AddConstant(MakeJitConstant("SEQ_LEN_PARTITION_SIZE", get_seq_len_partition_size(params, kernel_idx)));
+    jit.AddConstant(MakeJitConstant("SEQ_LEN_PARTITION_SIZE", get_seq_len_partition_size(params, config.head_size, kernel_idx)));
 
     auto target_seq_len_block_size = kernel_idx == KernelsTypes::SINGLE_TOKEN ? 1 : get_target_seq_len_block_size();
     jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN_BLOCK_SIZE", target_seq_len_block_size));
 
     auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION ? 1 : 0;
     jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1));
-    jit.AddConstant(MakeJitConstant("SG_SCALE_FACTOR", get_sg_number_scale_factor(params, kernel_idx)));
+    jit.AddConstant(MakeJitConstant("SG_SCALE_FACTOR", get_sg_number_scale_factor(params, config.head_size, kernel_idx)));
 
     if (params.conf.is_paged_attention) {
         if (params.conf.has_alibi_input) {
@@ -196,6 +196,10 @@ JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t ke
         } else {
             jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", 1));
         }
+
+        if (params.outputs.size() > 1) {
+            jit.AddConstant(MakeJitConstant("PAGED_ATTENTION_SCORES_OUTPUT", 1));
+        }
     } else if (params.inputs.size() <= 4) {
         jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", std::sqrt(static_cast<float>(params.conf.head_size))));
         jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE", 1.0f / std::sqrt(static_cast<float>(params.conf.head_size))));
@@ -218,11 +222,11 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k
     if (params.conf.is_paged_attention) {
         OPENVINO_ASSERT(kernel_idx == KernelsTypes::MULTI_TOKENS);
 
-        const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx);
         const size_t heads_num = static_cast<size_t>(params.conf.heads_num);
+        const size_t head_size = static_cast<size_t>(params.conf.head_size);
+        const size_t sg_num_scale = get_sg_number_scale_factor(params, head_size, kernel_idx);
         const size_t target_seq_len_block_size = get_target_seq_len_block_size();
         const size_t target_seq_len = static_cast<size_t>(params.conf.paged_attention_aligned_seq_len);
-        const size_t head_size = static_cast<size_t>(params.conf.head_size);
 
         dispatch_data.gws = { heads_num,
                               CeilDiv(target_seq_len, target_seq_len_block_size),
@@ -243,13 +247,13 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k
         const size_t target_seq_len_block_size = kernel_idx == 1 ? get_target_seq_len_block_size() : 1;
 
         if (kernel_idx == KernelsTypes::SINGLE_TOKEN) {
-            const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx);
+            const size_t sg_num_scale = get_sg_number_scale_factor(params, head_size, kernel_idx);
             dispatch_data.gws = { batch_size * heads_num,
                                   CeilDiv(target_seq_len, target_seq_len_block_size),
                                   head_size * num_of_partitions * sg_num_scale };
             dispatch_data.lws = { 1, 1, head_size * sg_num_scale };
         } else if (kernel_idx == KernelsTypes::MULTI_TOKENS) {
-            const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx);
+            const size_t sg_num_scale = get_sg_number_scale_factor(params, head_size, kernel_idx);
             dispatch_data.gws = { batch_size * heads_num,
                                   CeilDiv(target_seq_len, target_seq_len_block_size),
                                   head_size * sg_num_scale };
@@ -317,7 +321,7 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const {
                              false,
                              inputs_num,
                              GetFusedPrimitiveInputsCount(params),
-                             static_cast<uint32_t>(prim_params.outputs.size()),
+                             1 /* number_of_outputs */,
                              prim_params.is_shape_agnostic);
 
         auto beam_table_idx = prim_params.inputs.size();
@@ -339,6 +343,19 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const {
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2});
 
+        if (prim_params.conf.is_paged_attention && prim_params.outputs.size() > 1) {
+            // Intermediate buffers for PagedAttention scores calculation:
+            // softmax_results, subsequence_offsets, exp_sums, max_logits, tmp_out
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 5});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 7});
+
+            // Scalar used for proper offset calculation of intermediate data buffers
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
+        }
+
         const auto buf_sizes = get_internal_buffer_sizes(prim_params, kernel_idx);
         if (!prim_params.conf.is_paged_attention) {
             kd.internalBufferSizes.clear();
@@ -379,6 +396,15 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const {
             kernel_data.kernels[0].params.workGroups.global = dispatch_data.gws;
             kernel_data.kernels[0].params.workGroups.local = dispatch_data.lws;
             kernel_data.kernels[0].skip_execution = false;
+
+            if (prim_params.outputs.size() > 1) {
+                const auto max_seq_len = prim_params.conf.paged_attention_max_len;
+                const auto seq_len_partition_size = get_seq_len_partition_size(params, prim_params.conf.head_size, KernelsTypes::MULTI_TOKENS);
+
+                kernel_data.kernels[0].params.scalars.resize(1);
+                kernel_data.kernels[0].params.scalars[0].t = ScalarDescriptor::Types::UINT32;
+                kernel_data.kernels[0].params.scalars[0].v.u32 = static_cast<uint32_t>(Align(max_seq_len, seq_len_partition_size));
+            }
         } else {
             const auto num_of_partitions = get_partitions_num(prim_params, KernelsTypes::SINGLE_TOKEN);
             const auto buf_sizes = get_internal_buffer_sizes(prim_params, KernelsTypes::SINGLE_TOKEN);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h
index 8d7279f5546112..a4d351498d7075 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h
@@ -17,6 +17,9 @@ class SDPAKernelOpt : public SDPAKernelBase {
     KernelsPriority GetKernelsPriority(const Params& params) const override;
     ParamsKey GetSupportedKey() const override;
 
+    static size_t get_sg_number_scale_factor(const Params& params, size_t head_size, size_t kernel_type);
+    static size_t get_seq_len_partition_size(const Params& params, size_t head_size, size_t kernel_type);
+
 protected:
     bool Validate(const Params& p) const override;
     void GetUpdateDispatchDataFunc(KernelData& kd) const override;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp
index cd8128baff37c9..e9fa5dd675629a 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp
@@ -21,16 +21,28 @@ namespace cache {
 primitive_db::primitive_db()
     : primitives({
 #include "ks_primitive_db.inc"
+      }),
+      cm_primitives({
+#include "ks_cm_primitive_db.inc"
       }),
       batch_headers({
 #include "ks_primitive_db_batch_headers.inc"
+      }),
+      cm_batch_headers({
+#include "ks_cm_primitive_db_batch_headers.inc"
       }) {
 }
 
-std::vector<code> primitive_db::get(const primitive_id& id) const {
+std::vector<code> primitive_db::get(const primitive_id& id, bool is_cm) const {
 #ifndef NDEBUG
     {
-        std::ifstream kernel_file{id + ".cl", std::ios::in | std::ios::binary};
+        std::string filename = id;
+        if (!is_cm) {
+            filename += ".cl";
+        } else {
+            filename += ".cpp";
+        }
+        std::ifstream kernel_file{filename, std::ios::in | std::ios::binary};
         if (kernel_file.is_open()) {
             code ret;
             auto beg = kernel_file.tellg();
@@ -46,7 +58,11 @@ std::vector<code> primitive_db::get(const primitive_id& id) const {
     }
 #endif
     try {
-        const auto codes = primitives.equal_range(id);
+        auto* primitives_ptr = &primitives;
+        if (is_cm) {
+            primitives_ptr = &cm_primitives;
+        }
+        const auto codes = primitives_ptr->equal_range(id);
 
         std::vector<code> temp;
         std::for_each(codes.first, codes.second, [&](const std::pair<const primitive_id, code>& c) {
             temp.push_back(c.second);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h
index e384f6c9879fb5..5c6987246ce1f4 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h
@@ -21,8 +21,9 @@ using primitive_id = std::string;
 
 struct primitive_db {
     primitive_db();
-    std::vector<code> get(const primitive_id& id) const;
+    std::vector<code> get(const primitive_id& id, bool is_cm = false) const;
     std::map<primitive_id, std::string> get_batch_headers() const { return std::move(batch_headers); }
+    std::map<primitive_id, std::string> get_cm_batch_headers() const { return std::move(cm_batch_headers); }
 
 private:
     struct case_insensitive_compare {
@@ -35,7 +36,9 @@
         }
     };
     std::multimap<primitive_id, code, case_insensitive_compare> primitives;
+    std::multimap<primitive_id, code, case_insensitive_compare> cm_primitives;
     std::map<primitive_id, std::string> batch_headers;
+    std::map<primitive_id, std::string> cm_batch_headers;
 };
 
 }  // namespace cache
diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
index 116844f3bccfc7..393e67f3bdb6aa 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
@@ -6,21 +6,30 @@
 # the trailing characters are a tag to allow multiple primitive implementations
 
 from __future__ import print_function
+from enum import Enum
 import os
 import argparse
 import glob
 import ntpath
 import re
 
-class OpenCL2CHeaders(object):
+class KernelLang(Enum):
+    OCLC = 0
+    CM = 1
+
+    def header_extension(self):
+        return (".cl", ".h")[self.value]
+
+    def source_extension(self):
+        return (".cl", ".cpp")[self.value]
+
+class Kernels2CHeaders(object):
 
-    def __init__(self, kernels_folder, out_path, out_file_name_prim_db, out_file_name_batch_headers):
+    def __init__(self, kernels_folder, out_path, out_file_name_prim_db, out_file_name_batch_headers, kernel_lang):
         self.kernels_folder = os.path.abspath(kernels_folder)
         self.out_path = os.path.abspath(out_path)
         self.out_file_name_prim_db = out_file_name_prim_db
         self.out_file_name_batch_headers = out_file_name_batch_headers
         self.include_files = {}
         self.batch_headers = []
+        self.kernel_lang = kernel_lang
        self.find_and_set_batch_headers()
 
     # NOTE: batch_headers are headers with macros on which the runtime jitter might depend on.
@@ -29,7 +38,7 @@ def __init__(self, kernels_folder, out_path, out_file_name_prim_db, out_file_nam
     # specially for improving the jit compilation performance, i.e.,
     # they are not to be included in each kernel, but to be included only once at the beginning of each batch.
     def find_and_set_batch_headers(self):
-        batch_headers_list = [ntpath.basename(h) for h in glob.glob(os.path.join(self.kernels_folder, "include/batch_headers/*.cl"))]
+        batch_headers_list = [ntpath.basename(h) for h in glob.glob(os.path.join(self.kernels_folder, "include/batch_headers/*" + self.kernel_lang.header_extension()))]
         deps = {}
         for h in batch_headers_list:
             header_file = os.path.abspath(os.path.join(self.kernels_folder, "include/batch_headers", h))
@@ -56,11 +65,11 @@ def topological_sort(self, cur_key, items, stack, res):
 
     def convert(self):
         res = '// This file is autogenerated by primitive_db_gen.py, all changes to this file will be undone\n\n'
-        filelist = glob.glob(os.path.join(self.kernels_folder, "*.cl"))
+        filelist = glob.glob(os.path.join(self.kernels_folder, "*" + self.kernel_lang.source_extension()))
         for filename in filelist:
             #try:
             print('processing {}'.format(filename))
-            res += self.cl_file_to_str(filename)
+            res += self.kernel_file_to_str(filename)
             #except:
             #    pass
         out_file_name_prim_db = os.path.join(self.out_path, self.out_file_name_prim_db)
@@ -198,8 +207,8 @@ def batch_headers_to_str(self):
         characters = 1  # Newline character above
         res = ""
         for h in self.batch_headers:
-            header_name = h[:h.find('.cl')]
-            res += '{{"{}",\n(std::string) R"(\n'.format(header_name)
+            header_name = h[:h.rfind('.')]
+            res += '{{"{}",\n(std::string) R"-(\n'.format(header_name)
             header_file = os.path.abspath(os.path.join(os.path.dirname(self.kernels_folder + "/include/batch_headers"), "batch_headers/" + h))
             content = []
             with open(header_file) as f:
@@ -208,11 +217,11 @@
                 if line.startswith('#include'):
                     continue
                 if (i + 1) % max_lines == 0 or characters + len(line) + 1 > max_characters:
-                    res += ')"\n + (std::string) R"('
+                    res += ')-"\n + (std::string) R"-('
                     characters = 0
                 res += '{}\n'.format(line.rstrip())
                 characters += len(line) + 1
-            res += ')"},\n\n'
+            res += ')-"},\n\n'
 
         return self.post_process_sources(res)
 
     def post_process_sources(self, content):
@@ -241,10 +250,10 @@ def comment_replacer(match):
 
         return content
 
-    def cl_file_to_str(self, filename):
+    def kernel_file_to_str(self, filename):
         name = ntpath.basename(filename)
         self.include_files[filename] = {}
-        kernel_name = name[:name.find('.cl')]
+        kernel_name = name[:name.rfind('.')]
         res = '{{"{}",\n(std::string) R"__krnl(\n'.format(kernel_name)
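+        # NOTE (editorial comment, not in the original patch): keying entries by file
+        # stem via rfind('.') lets '.cl' (OpenCL C) and '.cpp' (CM) kernel sources
+        # share the same db naming scheme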
         content = self.append_file_content(filename, filename)
         content += self.append_undefs(filename)
@@ -265,16 +274,17 @@ def cl_file_to_str(self, filename):
 
         return res
 
-
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-kernels', required=True, metavar='PATH', help='The absolute path to OpenCL kernels folder')
     ap.add_argument('-out_path', required=True, metavar='PATH', help='The absolute path to dump file')
     ap.add_argument('-out_file_name_prim_db', required=True, metavar='PATH', help='dump file name')
     ap.add_argument('-out_file_name_batch_headers', required=True, metavar='PATH', help='dump file name')
+    ap.add_argument('-cm', required=False, action='store_true', help='Process CM kernel sources instead of ocl c')
     args = ap.parse_args()
 
-    converter = OpenCL2CHeaders(args.kernels, args.out_path, args.out_file_name_prim_db, args.out_file_name_batch_headers)
+    kernel_lang = KernelLang.CM if args.cm else KernelLang.OCLC
+    converter = Kernels2CHeaders(args.kernels, args.out_path, args.out_file_name_prim_db, args.out_file_name_batch_headers, kernel_lang)
     converter.convert()
 
 if __name__ == '__main__':
diff --git a/src/plugins/intel_gpu/src/plugin/ops/fake_convert.cpp b/src/plugins/intel_gpu/src/plugin/ops/fake_convert.cpp
new file mode 100644
index 00000000000000..282a483deab189
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/ops/fake_convert.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/plugin/program_builder.hpp"
+#include "intel_gpu/plugin/common_utils.hpp"
+
+#include "openvino/op/fake_convert.hpp"
+
+#include "intel_gpu/primitives/fake_convert.hpp"
+
+namespace ov {
+namespace intel_gpu {
+
+static void CreateFakeConvertOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v13::FakeConvert>& op) {
+    validate_inputs_count(op, {2, 3});
+    const auto inputs = p.GetInputInfo(op);
+    const std::string layerName = layer_type_name_ID(op);
+    ov::element::Type destination_type = op->get_destination_element_type();
+    std::shared_ptr<cldnn::fake_convert> fake_convert_prim = nullptr;
+    if (inputs.size() == 2) {
+        fake_convert_prim = std::make_shared<cldnn::fake_convert>(layerName,
+                                                                  inputs[0],
+                                                                  inputs[1],
+                                                                  destination_type);
+    } else {
+        fake_convert_prim = std::make_shared<cldnn::fake_convert>(layerName,
+                                                                  inputs[0],
+                                                                  inputs[1],
+                                                                  inputs[2],
+                                                                  destination_type);
+    }
+
+    p.add_primitive(*op, fake_convert_prim);
+}
+
+REGISTER_FACTORY_IMPL(v13, FakeConvert);
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
index 7425b096b6d324..d82d3a66fed7f7 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
@@ -61,10 +61,13 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared_ptr<ov::op::PagedAttentionExtension>& op) {
     OPENVINO_ASSERT(alibi_const != nullptr);
     prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0;
 
+    prim.num_outputs = 1;
     if (op->get_output_size() > 1) {
         const auto scores_output_idx = 1;
         const auto& users = op->get_output_target_inputs(scores_output_idx);
-        OPENVINO_ASSERT(users.size() == 0, "[GPU] PagedAttention implementation doesn't support scores output yet");
+        if (users.size() > 0) {
+            prim.num_outputs++;  // Add scores output
+        }
     }
 
     p.add_primitive(*op, prim);
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 53ab9aa188b7aa..7c7c09adcd182f 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -92,6 +92,7 @@
 #include "transformations/common_optimizations/lstm_cell_fusion.hpp"
 #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp"
 #include "transformations/common_optimizations/mvn_fusion.hpp"
+#include "transformations/common_optimizations/sdpa_scale_fusion.hpp"
 #include "transformations/common_optimizations/softmax_fusion.hpp"
 #include "transformations/common_optimizations/glu_fusion.hpp"
 #include "transformations/common_optimizations/transpose_sinking.hpp"
@@ -941,6 +942,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         if (!disable_horizontal_fc_fusion)
             manager.register_pass();
 
+        manager.register_pass<ov::pass::SDPAScaleFusion>();
         manager.register_pass();
         auto pass_config = manager.get_pass_config();
         manager.register_pass();
diff --git a/src/plugins/intel_gpu/src/runtime/layout.cpp b/src/plugins/intel_gpu/src/runtime/layout.cpp
index a2b7e62ea0cae2..5c6c6dc83aeaea 100644
--- a/src/plugins/intel_gpu/src/runtime/layout.cpp
+++ b/src/plugins/intel_gpu/src/runtime/layout.cpp
@@ -446,8 +446,6 @@ bool layout::compatible(const layout& other) const {
     if (l1.is_dynamic() || l2.is_dynamic())
         return false;
 
-    auto l1_size = l1.get_tensor();
-    auto l2_size = l2.get_tensor();
     if (l1 == l2)
         return true;
     if (check_redundant_1d_along_feature(l1, l2))
@@ -459,7 +457,7 @@ bool layout::compatible(const layout& other) const {
     if (format::is_default_format(l1.format) && format::is_default_format(l2.format) &&
         !l1.data_padding && !l2.data_padding && l1.get_linear_size() == l2.get_linear_size())
         return true;
-    if (l1_size != l2_size)
+    if (l1.get_shape() != l2.get_shape())
         return false;
     if (l1.get_linear_size() != l2.get_linear_size())
         return false;
@@ -505,6 +503,19 @@ bool layout::compatible(const layout& other) const {
     auto l1_pitch = l1.get_pitches();
     auto l2_pitch = l2.get_pitches();
 
+    auto l1_padded_dims = l1.get_padded_dims();
+    auto l2_padded_dims = l2.get_padded_dims();
+
+    // Ignore pitches which will never be used (for padded dims with size == 1)
+    for (size_t i = 0; i < l1_padded_dims.size(); ++i) {
+        if (l1_padded_dims[i] == 1) {
+            l1_pitch[i] = 0;
+        }
+        if (l2_padded_dims[i] == 1) {
+            l2_pitch[i] = 0;
+        }
+    }
+
     auto l1_offset = l1.get_linear_offset();
     auto l2_offset = l2.get_linear_offset();
     if (l1_pitch == l2_pitch && l1_offset == l2_offset)
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/fake_convert.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/fake_convert.cpp
new file mode 100644
index 00000000000000..d1236f5c524421
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/fake_convert.cpp
@@ -0,0 +1,141 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "common_test_utils/file_utils.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+#include "openvino/op/parameter.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/gather.hpp"
+#include "openvino/op/fake_convert.hpp"
+
+namespace {
+
+namespace fp8 {
+constexpr float MAX_F8E4M3 = 448.f;
+constexpr float MAX_F8E5M2 = 57344.f;
+}  // namespace fp8
+
+using namespace std;
+using namespace ov;
+using namespace testing;
+using ov::test::InputShape;
+
+using FakeConvertTestParams = std::tuple<
+        ov::Shape,            // Input shapes
+        ov::Shape,            // Scale shape
+        ov::Shape,            // Shift shape
+        ov::element::Type,    // input precision
+        ov::element::Type,    // destination type
+        std::string >;        // device name
+
+class FakeConvertTest : public testing::WithParamInterface<FakeConvertTestParams>,
+                        virtual public ov::test::SubgraphBaseStaticTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<FakeConvertTestParams> obj) {
+        ov::Shape input_shape;
+        ov::Shape scale_shape;
+        ov::Shape shift_shape;
+        ov::element::Type prec;
+        ov::element::Type destination_type;
+        std::string target_device;
+
+        std::tie(input_shape, scale_shape, shift_shape, prec, destination_type, target_device) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=(";
+        result << ov::test::utils::vec2str(input_shape) << "_";
+        result << "scale_shape=" << ov::test::utils::vec2str(scale_shape) << "_";
+        result << "shift_shape=" << ov::test::utils::vec2str(shift_shape) << "_";
+        result << "input_precision=" << prec << "_";
+        result << "destination_type=" << destination_type << "_";
+        result << "device_type=" << target_device;
+        return result.str();
+    }
+
+protected:
+    ov::Shape input_shape, scale_shape, shift_shape;
+    ov::element::Type destination_type;
+
+    void SetUp() override {
+        ov::element::Type prec;
+        std::tie(input_shape, scale_shape, shift_shape, prec, destination_type, targetDevice) = GetParam();
+        const float MAX_FP8 = (destination_type == ov::element::f8e4m3) ? fp8::MAX_F8E4M3 : fp8::MAX_F8E5M2;
+        if (shift_shape.empty()) {
+            auto data = make_shared<op::v0::Parameter>(prec, input_shape);
+            auto scale = op::v0::Constant::create(prec,
+                                                  scale_shape,
+                                                  {MAX_FP8 / (MAX_FP8 / 2.f),
+                                                   1.0f,
+                                                   MAX_FP8 / (MAX_FP8 * 3.5f),
+                                                   MAX_FP8 / (MAX_FP8 * 4.f)});
+
+            auto op = make_shared<op::v13::FakeConvert>(data, scale, destination_type);
+
+            function = make_shared<Model>(OutputVector{op}, ParameterVector{data});
+        } else {
+            auto data = make_shared<op::v0::Parameter>(prec, input_shape);
+            auto scale = op::v0::Constant::create(prec,
+                                                  scale_shape,
+                                                  {MAX_FP8 / (MAX_FP8 / 2.f),
+                                                   1.0f,
+                                                   MAX_FP8 / (MAX_FP8 * 3.5f),
+                                                   MAX_FP8 / (MAX_FP8 * 4.f)});
+            auto shift = op::v0::Constant::create(prec, shift_shape, {0.f, 0.f, 0.f, 0.f});
+
+            auto op = make_shared<op::v13::FakeConvert>(data, scale, shift, destination_type);
+
+            function = make_shared<Model>(OutputVector{op}, ParameterVector{data});
+        }
+    }
+
+    void generate_inputs(const std::vector<ov::Shape>& target_shapes) override {
+        inputs.clear();
+        const float MAX_FP8 = (destination_type == ov::element::f8e4m3) ? fp8::MAX_F8E4M3 : fp8::MAX_F8E5M2;
+        const auto& func_inputs = function->inputs();
+        auto& data_input = func_inputs[0];
+        ov::Tensor tensor = ov::Tensor(data_input.get_element_type(), target_shapes[0]);
+        std::vector<float> input_data{MAX_FP8 / 4.f,
+                                      MAX_FP8 / 3.f,
+                                      MAX_FP8 / 2.f,
+                                      MAX_FP8,
+                                      MAX_FP8,
+                                      MAX_FP8,
+                                      MAX_FP8 * 1.2f,
+                                      MAX_FP8 * 2.3f,
+                                      MAX_FP8 * 3.4f,
+                                      MAX_FP8 * 2.f,
+                                      MAX_FP8 * 3.f,
+                                      MAX_FP8 * 4.f};
+        auto* data_ptr = tensor.data<float>();
+        for (size_t i = 0; i < input_data.size(); i++) {
+            data_ptr[i] = input_data[i];
+        }
+        inputs.insert({data_input.get_node_shared_ptr(), tensor});
+    }
+};
+
+TEST_P(FakeConvertTest, Inference) {
+    run();
+}
+
+const std::vector<ov::element::Type> input_precisions = {ov::element::f32};
+
+const std::vector<ov::Shape> input_shapes = {{4, 3}};
+
+const ov::Shape scale_shape = {4, 1};
+const std::vector<ov::Shape> shift_shapes = {{4, 1}, {}};
+const std::vector<ov::element::Type> destination_types = {ov::element::f8e4m3, ov::element::f8e5m2};
+
+INSTANTIATE_TEST_SUITE_P(Smoke_FakeConvertTest,
+                         FakeConvertTest,
+                         ::testing::Combine(::testing::ValuesIn(input_shapes),
+                                            ::testing::Values(scale_shape),
+                                            ::testing::ValuesIn(shift_shapes),
+                                            ::testing::ValuesIn(input_precisions),
+                                            ::testing::ValuesIn(destination_types),
+                                            ::testing::Values(ov::test::utils::DEVICE_GPU)),
+                         FakeConvertTest::getTestCaseName);
+}  // namespace
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp
index a16cd20846a1c7..5dfc450e43905a 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp
@@ -85,6 +85,7 @@
 #include "intel_gpu/primitives/swiglu.hpp"
 #include "intel_gpu/primitives/tile.hpp"
 #include "intel_gpu/primitives/unique.hpp"
+#include "intel_gpu/primitives/fake_convert.hpp"
 #include "primitive_inst.h"
 #include "test_utils.h"
 
@@ -226,5 +227,6 @@ TEST(registry_test, no_null_impls) {
                cldnn::unique_count,
                cldnn::unique_gather,
                cldnn::scaled_dot_product_attention,
-               cldnn::rope>();
+               cldnn::rope,
+               cldnn::fake_convert>();
 }
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
index 7c666819176a13..279a86c73f55bf 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
@@ -261,6 +261,10 @@ INSTANTIATE_TEST_SUITE_P(smoke, layout_cmp_test,
          layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::is_os_zyx_isv16_osv16}, false, false},
         {layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::goiyx},
          layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::gioyx}, false, false},
+        {layout{ov::PartialShape{4, 1, 16, 16}, data_types::f16, format::bfyx},
+         layout{ov::PartialShape{4, 1, 16, 16}, data_types::f16, format::byxf}, false, true},
+        {layout{ov::PartialShape{2, 1, 2, 4}, data_types::f16, format::bfyx, padding({0, 0, 1, 0}, {0, 0, 1, 0})},
+         layout{ov::PartialShape{2, 1, 2, 4}, data_types::f16, format::bfyx, padding({0, 1, 0, 0}, {0, 0, 0, 0})}, false, false},
     }));
 
 struct layouts_transform_test_params {
diff --git a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
index 9a4cb71450a53c..0eb425b4dc1119 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
@@ -192,9 +192,9 @@ TEST(add_required_reorders, skip_adding_reorder_batch_axis_padding) {
     crop_prim = network.get_primitive("crop2");
     ASSERT_EQ(crop_prim->can_be_optimized(), true);
     auto reorder_prim = network.get_primitive("crop1_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     reorder_prim = network.get_primitive("crop2_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     auto concate = network.get_primitive("concat");
     ASSERT_EQ(concate->can_be_optimized(), false);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
index 493ab79bf8e2cb..ee4382e51645cd 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
@@ -318,3 +318,108 @@ TEST(mark_shape_of_subgraphs, gather_compressed_no_mark) {
     ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("gather_compressed")));
     ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("concat")));
 }
+
+TEST(mark_shape_of_subgraphs, broadcast_not_existed_after_shapeof) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto data_0 = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
+    set_values(data_0, {1, 4, 1, 1});
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, {1152, 4, 1, 1} });
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("data_0", data_0));
+    topology.add(data("weights", weights));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(reshape("reshape", input_info("shape_of"), input_info("data_0"), false, {}));
+    topology.add(convolution("convolution", input_info("reshape"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("convolution")));
+}
+
+TEST(mark_shape_of_subgraphs, broadcast_w_data_and_direct_shapeof_no_mark) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto data_0 = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
+    set_values(data_0, {0});
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, {1152, 4, 2, 2} });
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("data_0", data_0));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(broadcast("broadcast", input_info("data_0"), input_info("shape_of"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+    topology.add(data("weights", weights));
+    topology.add(convolution("convolution", input_info("broadcast"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("convolution")));
+    ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
+}
+
+TEST(mark_shape_of_subgraphs, broadcast_w_data_and_indirect_shapeof) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto data_0 = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
+    set_values(data_0, {0});
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("data_0", data_0));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(gather("gather", input_info("shape_of"), input_info("data_0"), 0, 0, {}));
+    topology.add(broadcast("broadcast", input_info("data_0"), input_info("gather"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
+}
+
+TEST(mark_shape_of_subgraphs, broadcast_w_direct_shapeof_and_data) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto target_shape = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
+    set_values(target_shape, {4, 4, 1, 1});
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("target_shape", target_shape));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(broadcast("broadcast", input_info("shape_of"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+    topology.add(reshape("reshape", input_info("input"), input_info("broadcast"), false, ov::PartialShape{4, 4, 1, 1}));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
+}
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 456fab4ae0286a..1eb11c662608e0 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1224,7 +1224,7 @@ TEST(prepare_buffer_fusing, test_implicit_crop_and_outerpadding) {
     auto reorder_prim = network.get_primitive("gather1_reorder");
     ASSERT_EQ(reorder_prim->can_be_optimized(), true);
     reorder_prim = network.get_primitive("gather2_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     auto reshape_prim = network.get_primitive("reshape1");
     ASSERT_EQ(reshape_prim->can_be_optimized(), true);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
index f0243f055c3670..13934020bfdf66 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
@@ -10820,7 +10820,14 @@ TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) {
         return outputs_ref.at("conv").get_memory();
     };
 
-    auto in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(p.in_shape[1]), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16};
+    cldnn::layout in_layout;
+    if (p.in_shape[2] % 2 == 0) {
+        // input feature is static
+        in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(p.in_shape[1]), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16};
+    } else {
+        // input feature is dynamic
+        in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16};
+    }
 
     auto input = engine.allocate_memory({ p.in_shape, data_types::f16, format::b_fs_yx_fsv16 });
     auto weights = engine.allocate_memory({p.wei_shape, data_types::f16, is_grouped ? format::bfzyx : format::bfyx});
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index f59dc5c42cffc1..5bc7e403d3bf74 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -4137,6 +4137,10 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input
     this->test_compressed_int4_scale_dyn_quan(false, true, 511, true);
 }
 
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_batch_1) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 1, 2048, 3072);
+}
+
 TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_case) {
     this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560);
 }
@@ -4827,3 +4831,52 @@ TEST_F(fully_connected_gpu_tests, weights_reorder_shapes_update) {
 TEST_F(fully_connected_gpu_tests, weights_reorder_shapes_update_cached) {
     this->test_weights_reorder_shapes_update(true);
 }
+
+TEST(fully_connected_gpu, cm) {
+    int min_random = -2, max_random = 2;
+    auto& engine = get_test_engine();
+    ExecutionConfig config = get_test_default_config(engine);
+
+    if (!cldnn::check_cm_jit_support(engine, config)) {
+        GTEST_SKIP();
+    }
+
+    // Test parameters
+    const int batch_num = 2;
+    const int output_f = 4;
+    const int input_x = 1;
+    const int input_y = 1;
+    const int input_f = 3;
+
+    // Allocate memory
+    auto input_prim = engine.allocate_memory({ data_types::f16, format::bfyx, { batch_num, input_f, input_y, input_x } });
+    auto weights_prim = engine.allocate_memory({ data_types::f16, format::oiyx, { output_f, input_f, input_y, input_x } });
+    auto bias_prim = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 1, output_f, 1 } });
+
+    // Generate random input data and set values
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto input_data = rg.generate_random_4d<ov::float16>(batch_num, input_f, input_y, input_x, min_random, max_random);
+    auto weights_data = rg.generate_random_4d<ov::float16>(output_f, input_f, input_y, input_x, min_random, max_random);
+    auto bias_data = rg.generate_random_1d<ov::float16>(output_f, min_random, max_random);
+
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    set_values(input_prim, input_data_bfyx);
+    set_values(weights_prim, weights_data_bfyx);
+    set_values(bias_prim, bias_data);
+
+    topology topology(
+        input_layout("input", input_prim->get_layout()),
+        data("weights", weights_prim),
+        data("bias", bias_prim),
+        fully_connected("fc_prim", input_info("input"), "weights", "bias")
+    );
+
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "", impl_types::cm };
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
+
+    network network(engine, topology, config);
+    network.set_input_data("input", input_prim);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "fc_prim");
+
+    // Do not validate output for CM
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/paged_attention_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/paged_attention_gpu_test.cpp
new file mode 100644
index 00000000000000..a32ef3325cd9bc
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/paged_attention_gpu_test.cpp
@@ -0,0 +1,687 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+#include "random_generator.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace cldnn;
+using namespace ov::intel_gpu;
+using namespace ::tests;
+
+/*
+* PagedAttention inputs:
+* [0]: query
+*      shape: [batch_size_in_tokens, num_heads * head_size], type: f16
+* [1]: key
+*      shape: [batch_size_in_tokens, num_kv_heads * head_size], type: f16
+* [2]: value
+*      shape: [batch_size_in_tokens, num_kv_heads * head_size], type: f16
+* [3]: key_cache
+*      shape: [num_blocks, num_kv_heads, head_size, block_size], type: f16
+* [4]: value_cache
+*      shape: [num_blocks, num_kv_heads, block_size, head_size], type: f16
+* [5]: past_lens
+*      shape: [batch_size_in_sequences], type: i32
+* [6]: subsequence_begins
+*      shape: [batch_size_in_sequences + 1], type: i32
+* [7]: block_indices
+*      shape: [num_blocks], type: i32
+* [8]: block_indices_begins
+*      shape: [batch_size_in_sequences + 1], type: i32
+* [9]: scale, optional
+* [10]: sliding_window, optional
+* [11]: alibi_slopes, optional
+* [12]: max_context_len
+*      shape: [], type: i32
+*/
+
+struct SubsequenceDescriptor {
+    int num_tokens;
+    int past_len;
+};
+
+struct PagedAttentionManager {
+    int num_heads;
+    int head_size;
+    int block_size;
+    std::vector<SubsequenceDescriptor> subsequence_descs;
+
+    // per-subsequence QKV inputs
+    std::vector<std::vector<ov::float16>> query_data; // {[1, num_tokens, num_heads, head_size], ..}
+    std::vector<std::vector<ov::float16>> key_data;   // {[1, past_len + num_tokens, num_heads, head_size], ..}
+    std::vector<std::vector<ov::float16>> value_data; // {[1, past_len + num_tokens, num_heads, head_size], ..}
+
+    // common PA inputs
+    std::vector<int> past_lens;
+    std::vector<int> subsequence_begins;
+    std::vector<int> block_indices;
+    std::vector<int> block_indices_begins;
+    std::vector<int> max_context_len;
+
+    cldnn::engine& test_engine;
+    cldnn::stream& test_stream;
+    tests::random_generator& rg;
+
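+    // Worked example (editorial illustration, not part of the original test):
+    // for two subsequences {num_tokens = 10, past_len = 0} and {num_tokens = 4, past_len = 12}
+    // with block_size = 16, the constructor below fills:
+    //   past_lens            = {0, 12}
+    //   subsequence_begins   = {0, 10, 14}
+    //   block_indices        = {0, 1}
+    //   block_indices_begins = {0, 1, 2}
+    //   max_context_len      = {16}
+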
PagedAttentionManager(tests::random_generator& rg, + cldnn::engine& engine, + cldnn::stream& stream, + const std::vector& subsequence_descs, + int num_heads, + int head_size, + int block_size) + : num_heads(num_heads) + , head_size(head_size) + , block_size(block_size) + , subsequence_descs(subsequence_descs) + , test_engine(engine) + , test_stream(stream) + , rg(rg) { + // init subsequence_begins and block_indices_begins + subsequence_begins.push_back(0); + block_indices_begins.push_back(0); + + int max_len = 0; + for (int i = 0; i < static_cast(subsequence_descs.size()); i++) { + const auto& subsequence_desc = subsequence_descs[i]; + max_len = std::max(max_len, subsequence_desc.num_tokens + subsequence_desc.past_len); + + query_data.push_back(generate_input_data(rg, num_heads, subsequence_desc.num_tokens, head_size)); + key_data.push_back(generate_input_data(rg, num_heads, subsequence_desc.num_tokens + subsequence_desc.past_len, head_size)); + value_data.push_back(generate_input_data(rg, num_heads, subsequence_desc.num_tokens + subsequence_desc.past_len, head_size)); + + past_lens.push_back(subsequence_desc.past_len); + int subsequence_start_pos = subsequence_begins[i]; + int subsequence_end_pos = subsequence_start_pos + subsequence_desc.num_tokens; + subsequence_begins.push_back(subsequence_end_pos); + + int subsequence_length = subsequence_desc.num_tokens + subsequence_desc.past_len; + int required_blocks = ceil_div(subsequence_length, block_size); + int start_block_idx = block_indices.empty() ? 0 : block_indices.back() + 1; + int end_block_idx = start_block_idx + required_blocks; + for (int block_idx = start_block_idx; block_idx < end_block_idx; block_idx++) { + block_indices.push_back(block_idx); + } + + int block_indices_start_pos = block_indices_begins[i]; + int block_indices_end_pos = block_indices_start_pos + required_blocks; + block_indices_begins.push_back(block_indices_end_pos); + } + max_context_len.push_back(max_len); + } + + memory::ptr get_query_memory() { + return get_QKV_memory(query_data, false); + } + + memory::ptr get_key_memory() { + return get_QKV_memory(key_data, true); + } + + memory::ptr get_value_memory() { + return get_QKV_memory(value_data, true); + } + + memory::ptr get_key_cache_memory() { + auto num_blocks = block_indices.back() + 1; + auto key_cache_shape = ov::PartialShape{ num_blocks, num_heads, head_size, block_size }; + auto key_cache_layout = layout{ key_cache_shape, data_types::f16, format::bfyx }; + auto memory = test_engine.allocate_memory(key_cache_layout); + + for (int i = 0; i < static_cast(subsequence_descs.size()); i++) { + int past_len = subsequence_descs[i].past_len; + if (past_len != 0) { + int blocks_num = ceil_div(past_len, block_size); + int start_block_idx = block_indices[block_indices_begins[i]]; + for (int block_idx = 0; block_idx < blocks_num; block_idx++) { + int last_token_idx = block_idx == blocks_num - 1 ? 
past_len % block_size + : block_size; + for (int token_idx = 0; token_idx < last_token_idx; token_idx++) { + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + for (int head_size_idx = 0; head_size_idx < head_size; head_size_idx++) { + size_t input_token_offset = block_idx * block_size + token_idx; + ov::float16* data_ptr = key_data[i].data() + + input_token_offset * num_heads * head_size + + head_idx * head_size + head_size_idx; + + // shape: [num_blocks, num_heads, head_size, block_size] + size_t output_offset = (start_block_idx + block_idx) * num_heads * head_size * block_size + + head_idx * head_size * block_size + + head_size_idx * block_size + + token_idx; + + set_values(test_stream, memory, data_ptr, 1, output_offset); + } + } + } + } + } + } + + return memory; + } + + memory::ptr get_value_cache_memory() { + auto num_blocks = block_indices.back() + 1; + auto value_cache_shape = ov::PartialShape{ num_blocks, num_heads, block_size, head_size }; + auto value_cache_layout = layout{ value_cache_shape, data_types::f16, format::bfyx }; + auto memory = test_engine.allocate_memory(value_cache_layout); + + for (int i = 0; i < static_cast(subsequence_descs.size()); i++) { + int past_len = subsequence_descs[i].past_len; + if (past_len != 0) { + int blocks_num = ceil_div(past_len, block_size); + int start_block_idx = block_indices[block_indices_begins[i]]; + for (int block_idx = 0; block_idx < blocks_num; block_idx++) { + int last_token_idx = block_idx == blocks_num - 1 ? past_len % block_size + : block_size; + for (int token_idx = 0; token_idx < last_token_idx; token_idx++) { + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + size_t input_token_offset = block_idx * block_size + token_idx; + ov::float16* data_ptr = value_data[i].data() + + input_token_offset * num_heads * head_size + + head_idx * head_size; + + // shape: [num_blocks, num_heads, block_size, head_size] + size_t output_offset = (start_block_idx + block_idx) * num_heads * block_size * head_size + + head_idx * block_size * head_size + + token_idx * head_size; + + set_values(test_stream, memory, data_ptr, head_size, output_offset); + } + } + } + } + } + + return memory; + } + + memory::ptr get_past_lens_memory() { + return get_memory_from_vec(past_lens); + } + + memory::ptr get_subsequence_begins_memory() { + return get_memory_from_vec(subsequence_begins); + } + + memory::ptr get_block_indices_memory() { + return get_memory_from_vec(block_indices); + } + + memory::ptr get_block_indices_begins_memory() { + return get_memory_from_vec(block_indices_begins); + } + + memory::ptr get_scale_memory() { + std::vector scale = { ov::float16(get_default_scale()) }; + return get_memory_from_vec(scale); + } + + memory::ptr get_sliding_window_memory() { + std::vector sliding_window = { 0 }; + return get_memory_from_vec(sliding_window); + } + + memory::ptr get_alibi_memory() { + std::vector alibi; + return get_memory_from_vec(alibi); + } + + memory::ptr get_max_context_len_memory() { + return get_memory_from_vec(max_context_len); + } + + float get_default_scale() { + return static_cast(1.f / std::sqrt(head_size)); + } + +private: + template + memory::ptr get_memory_from_vec(std::vector& input_data) { + auto data_size = input_data.empty() ? 
1 : input_data.size(); + auto shape = ov::PartialShape{ static_cast(data_size) }; + auto layout = cldnn::layout{ shape, ov::element::from(), format::bfyx }; + auto memory = test_engine.allocate_memory(layout); + + if (input_data.empty()) { + auto shape = ov::PartialShape{0}; + auto layout = cldnn::layout{ shape, ov::element::from(), format::bfyx }; + return test_engine.reinterpret_buffer(*memory, layout); + } + + set_values(test_stream, memory, input_data.data(), input_data.size(), 0); + + return memory; + } + + memory::ptr get_QKV_memory(std::vector>& input_data, bool skip_past_len) { + int total_tokens = 0; + for (const auto& subsequence_desc : subsequence_descs) + total_tokens += subsequence_desc.num_tokens; + + auto query_shape = ov::PartialShape{ total_tokens, num_heads * head_size }; + auto query_layout = layout{ query_shape, data_types::f16, format::bfyx }; + auto memory = test_engine.allocate_memory(query_layout); + + for (int subsequence_idx = 0; subsequence_idx < static_cast(subsequence_descs.size()); subsequence_idx++) { + for (int token_idx = 0; token_idx < subsequence_descs[subsequence_idx].num_tokens; token_idx++) { + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + size_t input_token_offset = token_idx; + // as generated data stored in vectors includes past_len, ignore it for KV inputs + if (skip_past_len) + input_token_offset += subsequence_descs[subsequence_idx].past_len; + + ov::float16* data_ptr = input_data[subsequence_idx].data() + + input_token_offset * num_heads * head_size + + head_idx * head_size; + + size_t output_token_offset = subsequence_begins[subsequence_idx] + token_idx; + size_t output_offset = output_token_offset * num_heads * head_size + + head_idx * head_size; + + set_values(test_stream, memory, data_ptr, head_size, output_offset); + } + } + } + + return memory; + } + + template + static void set_values(stream& stream, memory::ptr mem, T* vals, size_t size, size_t dst_offset) { + mem_lock mem_ptr(mem, stream); + for (size_t i = 0; i < size; i++) { + mem_ptr[dst_offset + i] = vals[i]; + } + } + + static std::vector generate_input_data(tests::random_generator& rg, size_t num_heads, size_t tokens_num, size_t head_size) { + const size_t total_elements_num = tokens_num * num_heads * head_size; + auto data = rg.generate_random_1d(total_elements_num, -1, 1); + + return data; + } +}; + +struct PagedAttentionReference { + PagedAttentionReference(PagedAttentionManager& pam) + : pam(pam) + , test_engine(pam.test_engine) + , test_stream(pam.test_stream) {} + + std::pair, std::vector> get_reference() { + std::vector ref_data_output; + std::vector ref_scores_output; + + for (size_t i = 0; i < pam.subsequence_descs.size(); i++) { + const auto& subsequence_desc = pam.subsequence_descs[i]; + const auto kv_seq_len = subsequence_desc.num_tokens + subsequence_desc.past_len; + auto subsequence_ref_results = run_reference(pam.query_data[i], + pam.key_data[i], + pam.value_data[i], + subsequence_desc.num_tokens, + kv_seq_len, + pam.num_heads, + pam.head_size, + pam.get_default_scale()); + + // concatenate all subsequences into one vector + ref_data_output.insert(ref_data_output.end(), + subsequence_ref_results.first.begin(), + subsequence_ref_results.first.end()); + ref_scores_output.insert(ref_scores_output.end(), + subsequence_ref_results.second.begin(), + subsequence_ref_results.second.end()); + } + + return { ref_data_output, ref_scores_output }; + } + +private: + std::pair, std::vector> + run_reference(const std::vector& query_data, + const std::vector& 
key_data, + const std::vector& value_data, + int num_queries, + int num_keys, + int num_heads, + int head_size, + float scale) { + auto query_shape = ov::PartialShape{1, num_queries, num_heads, head_size}; + auto key_shape = ov::PartialShape{1, num_keys, num_heads, head_size}; + auto value_shape = ov::PartialShape{1, num_keys, num_heads, head_size}; + + auto query_layout = layout{query_shape, data_types::f16, format::bfyx}; + auto key_layout = layout{key_shape, data_types::f16, format::bfyx}; + auto value_layout = layout{value_shape, data_types::f16, format::bfyx}; + + OPENVINO_ASSERT(query_layout.count() == query_data.size()); + OPENVINO_ASSERT(key_layout.count() == key_data.size()); + OPENVINO_ASSERT(value_layout.count() == value_data.size()); + + auto query_mem = test_engine.allocate_memory(query_layout); + auto key_mem = test_engine.allocate_memory(key_layout); + auto value_mem = test_engine.allocate_memory(value_layout); + auto mask_mem = get_mask_mem(num_queries, num_keys, num_heads); + + set_values(query_mem, query_data); + set_values(key_mem, key_data); + set_values(value_mem, value_data); + + topology topology; + topology.add(input_layout("query", query_layout), + input_layout("key", key_layout), + input_layout("value", value_layout), + data("mask", mask_mem), + permute("query_transposed", input_info("query"), {0, 2, 1, 3}), + permute("key_transposed", input_info("key"), {0, 2, 1, 3}), + permute("value_transposed", input_info("value"), {0, 2, 1, 3}), + gemm("qk_gemm", { input_info("query_transposed"), input_info("key_transposed") }, data_types::f16, false, true, scale), + eltwise("eltwise", { input_info("qk_gemm"), input_info("mask") }, eltwise_mode::sum), + softmax("softmax", input_info("eltwise"), -1), + gemm("qkv_gemm", { input_info("softmax"), input_info("value_transposed") }, data_types::f16, false, false), + permute("qkv_gemm_transposed", input_info("qkv_gemm"), {0, 2, 1, 3}), + reorder("output_data", input_info("qkv_gemm_transposed"), format::bfyx, data_types::f16), + reorder("scores_data", input_info("softmax"), format::bfyx, data_types::f16) + ); + + ExecutionConfig config = get_test_default_config(test_engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + network::ptr network = get_network(test_engine, topology, config, get_test_stream_ptr(), false); + network->set_input_data("query", query_mem); + network->set_input_data("key", key_mem); + network->set_input_data("value", value_mem); + + auto outputs = network->execute(); + + auto output_data_mem = outputs.at("output_data").get_memory(); + auto output_scores_mem = outputs.at("scores_data").get_memory(); + + return { get_output_data_vec(output_data_mem, num_queries, head_size, num_heads), + get_output_scores_vec(output_scores_mem, num_queries, num_keys, num_heads) }; + } + + std::vector get_output_scores_vec(memory::ptr scores_output, + int num_queries, + int num_keys, + int num_heads) { + OPENVINO_ASSERT(scores_output->count() == static_cast(num_heads * num_queries * num_keys)); + + std::vector output_scores(num_keys, 0); + mem_lock mem_ptr(scores_output, test_stream); + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + for (int score_idx = 0; score_idx < num_keys; score_idx++) { + output_scores[score_idx] += mem_ptr[head_idx * num_queries * num_keys + + (num_queries - 1) * num_keys + + score_idx]; + } + } + + return output_scores; + } + + std::vector get_output_data_vec(memory::ptr data_output, + int num_queries, + int 
+
+    std::vector<ov::float16> get_output_scores_vec(memory::ptr scores_output,
+                                                   int num_queries,
+                                                   int num_keys,
+                                                   int num_heads) {
+        OPENVINO_ASSERT(scores_output->count() == static_cast<size_t>(num_heads * num_queries * num_keys));
+
+        std::vector<ov::float16> output_scores(num_keys, 0);
+        mem_lock<ov::float16> mem_ptr(scores_output, test_stream);
+        for (int head_idx = 0; head_idx < num_heads; head_idx++) {
+            for (int score_idx = 0; score_idx < num_keys; score_idx++) {
+                output_scores[score_idx] += mem_ptr[head_idx * num_queries * num_keys +
+                                                    (num_queries - 1) * num_keys +
+                                                    score_idx];
+            }
+        }
+
+        return output_scores;
+    }
+
+    std::vector<ov::float16> get_output_data_vec(memory::ptr data_output,
+                                                 int num_queries,
+                                                 int head_size,
+                                                 int num_heads) {
+        OPENVINO_ASSERT(data_output->count() == static_cast<size_t>(num_queries * num_heads * head_size));
+
+        std::vector<ov::float16> output_data(data_output->count());
+        mem_lock<ov::float16> mem_ptr(data_output, test_stream);
+        for (size_t i = 0; i < data_output->count(); i++)
+            output_data[i] = mem_ptr[i];
+
+        return output_data;
+    }
+
+    memory::ptr get_mask_mem(int num_queries, int num_keys, int num_heads) {
+        /*
+         * Two kinds of masks:
+         *
+         * Case 1 (N == K):
+         * num_queries = N
+         * num_keys = K = N
+         * head_size = H
+         * Q [N, H] * K[H, N]
+         * QK [N, N]
+         *      0    1    ..   N
+         * 0 [  0,  MIN,  ..,  MIN ]
+         * 1 [  0,  0,    ..,  MIN ]
+         *   [  .., ..,   ..,  MIN ]
+         * N [  0,  0,    ..,  0   ]
+         *
+         * Case 2 (N != K):
+         * num_queries = N
+         * num_keys = K
+         * head_size = H
+         * past_len = P = K - N + 1
+         * Q [N, H] * K[H, K]
+         * QK [N, K]
+         *      0   1   2   P    ..   K
+         * 0 [  0,  0,  0,  MIN, MIN, MIN ]
+         * 1 [  0,  0,  0,  0,   MIN, MIN ]
+         *   [  .., .., .., ..,  ..,  MIN ]
+         * N [  0,  0,  0,  0,   ..,  0   ]
+         *
+         * Shapes:
+         * Q   [1, num_heads, num_queries, head_size]
+         * K   [1, num_heads, head_size, num_keys]
+         * Q*K [1, num_heads, num_queries, num_keys]
+         */
+
+        auto mask_shape = ov::PartialShape{ 1, 1, num_queries, num_keys };
+        auto mask_layout = layout{mask_shape, data_types::f16, format::bfyx};
+        auto mask_mem = test_engine.allocate_memory(mask_layout);
+
+        int past_len = num_keys - num_queries + 1;
+        mem_lock<ov::float16> mem_ptr(mask_mem, test_stream);
+        for (int i = 0; i < num_queries; i++) {
+            for (int j = 0; j < num_keys; j++) {
+                mem_ptr[i * num_keys + j] = j >= past_len + i ? std::numeric_limits<ov::float16>::lowest()
+                                                              : ov::float16(0.f);
+            }
+        }
+
+        return mask_mem;
+    }
+
+
+    PagedAttentionManager& pam;
+    cldnn::engine& test_engine;
+    cldnn::stream& test_stream;
+};
+
+template <typename T>
+struct PagedAttentionTest : public ::testing::TestWithParam<T> {
+public:
+    random_generator rg;
+    cldnn::engine& engine = get_test_engine();
+    float tolerance = 2e-3;
+
+    void SetUp() override {
+        rg.set_seed(GET_SUITE_NAME);
+    }
+
+    void execute(T& p) {
+        PagedAttentionManager pam(rg, get_test_engine(), get_test_stream(), p.subsequences, p.num_heads, p.head_size, p.block_size);
+
+        auto query_mem = pam.get_query_memory();
+        auto key_mem = pam.get_key_memory();
+        auto value_mem = pam.get_value_memory();
+
+        auto key_cache_mem = pam.get_key_cache_memory();
+        auto value_cache_mem = pam.get_value_cache_memory();
+
+        auto past_lens_mem = pam.get_past_lens_memory();
+        auto subsequence_begins_mem = pam.get_subsequence_begins_memory();
+        auto block_indices_mem = pam.get_block_indices_memory();
+        auto block_indices_begins_mem = pam.get_block_indices_begins_memory();
+
+        auto scale_mem = pam.get_scale_memory();
+        auto sliding_window_mem = pam.get_sliding_window_memory();
+        auto alibi_mem = pam.get_alibi_memory();
+        auto max_context_len_mem = pam.get_max_context_len_memory();
+
+        auto query_layout = query_mem->get_layout();
+        auto key_layout = key_mem->get_layout();
+        auto value_layout = value_mem->get_layout();
+        auto key_cache_layout = key_cache_mem->get_layout();
+        auto value_cache_layout = value_cache_mem->get_layout();
+        auto past_lens_layout = past_lens_mem->get_layout();
+        auto subsequence_begins_layout = subsequence_begins_mem->get_layout();
+        auto block_indices_layout = block_indices_mem->get_layout();
+        auto block_indices_begins_layout = block_indices_begins_mem->get_layout();
+        auto scale_layout = scale_mem->get_layout();
+        auto sliding_window_layout = sliding_window_mem->get_layout();
+        auto alibi_layout = alibi_mem->get_layout();
+        auto max_context_len_layout = max_context_len_mem->get_layout();
+
+        // make layouts dynamic
+        query_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads * p.head_size });
+        key_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads * p.head_size });
+        value_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads * p.head_size });
+        key_cache_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads, p.head_size, p.block_size });
+        value_cache_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads, p.block_size, p.head_size });
+        past_lens_layout.set_partial_shape(ov::PartialShape{ -1 });
+        subsequence_begins_layout.set_partial_shape(ov::PartialShape{ -1 });
+        block_indices_layout.set_partial_shape(ov::PartialShape{ -1 });
+        block_indices_begins_layout.set_partial_shape(ov::PartialShape{ -1 });
+
+        auto pa_prim = paged_attention("paged_attention", { input_info("query"),
+                                                            input_info("key"),
+                                                            input_info("value"),
+                                                            input_info("key_cache"),
+                                                            input_info("value_cache"),
+                                                            input_info("past_lens"),
+                                                            input_info("subsequence_begins"),
+                                                            input_info("block_indices"),
+                                                            input_info("block_indices_begins"),
+                                                            input_info("scale"),
+                                                            input_info("sliding_window"),
+                                                            input_info("alibi"),
+                                                            input_info("max_context_len") });
+
+        pa_prim.head_size = p.head_size;
+        pa_prim.kv_heads_num = p.num_heads;
+        pa_prim.heads_num = p.num_heads;
+        pa_prim.scale_val = pam.get_default_scale();
+        pa_prim.has_alibi = false;
+        pa_prim.num_outputs = p.scores_output ? 2 : 1;
+
+        topology topology;
+        topology.add(
+            input_layout("query", query_layout),
+            input_layout("key", key_layout),
+            input_layout("value", value_layout),
+            input_layout("key_cache", key_cache_layout),
+            input_layout("value_cache", value_cache_layout),
+            input_layout("past_lens", past_lens_layout),
+            input_layout("subsequence_begins", subsequence_begins_layout),
+            input_layout("block_indices", block_indices_layout),
+            input_layout("block_indices_begins", block_indices_begins_layout),
+            input_layout("scale", scale_layout),
+            input_layout("sliding_window", sliding_window_layout),
+            input_layout("alibi", alibi_layout),
+            input_layout("max_context_len", max_context_len_layout),
+            pa_prim,
+            reorder("output_data", input_info("paged_attention", 0), format::bfyx, data_types::f16)
+        );
+
+        if (p.scores_output) {
+            topology.add(reorder("output_scores", input_info("paged_attention", 1), format::bfyx, data_types::f16));
+        }
+
+        ExecutionConfig config = get_test_default_config(get_test_engine());
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+        network::ptr network = get_network(get_test_engine(), topology, config, get_test_stream_ptr(), false);
+        network->set_input_data("query", query_mem);
+        network->set_input_data("key", key_mem);
+        network->set_input_data("value", value_mem);
+        network->set_input_data("key_cache", key_cache_mem);
+        network->set_input_data("value_cache", value_cache_mem);
+        network->set_input_data("past_lens", past_lens_mem);
+        network->set_input_data("subsequence_begins", subsequence_begins_mem);
+        network->set_input_data("block_indices", block_indices_mem);
+        network->set_input_data("block_indices_begins", block_indices_begins_mem);
+        network->set_input_data("scale", scale_mem);
+        network->set_input_data("sliding_window", sliding_window_mem);
+        network->set_input_data("alibi", alibi_mem);
+        network->set_input_data("max_context_len", max_context_len_mem);
+
+        auto outputs = network->execute();
+
+        cldnn::memory::ptr output_data_mem = nullptr;
+        cldnn::memory::ptr output_scores_mem = nullptr;
+
+        output_data_mem = outputs.at("output_data").get_memory();
+        if (p.scores_output) {
+            output_scores_mem = outputs.at("output_scores").get_memory();
+        }
+
+        auto ref_data = PagedAttentionReference(pam).get_reference();
+        compare(output_data_mem, output_scores_mem, ref_data);
+    }
+
+    void compare(memory::ptr data_output_mem, memory::ptr scores_output_mem, std::pair<std::vector<ov::float16>, std::vector<ov::float16>> ref_data) {
+        if (data_output_mem) {
+            ASSERT_EQ(data_output_mem->count(), ref_data.first.size());
+            mem_lock<ov::float16> mem_ptr(data_output_mem, get_test_stream());
+            for (size_t i = 0; i < data_output_mem->count(); i++) {
+                ASSERT_NEAR(mem_ptr[i], ref_data.first[i], tolerance);
+            }
+        }
+
+        if (scores_output_mem) {
+            ASSERT_EQ(scores_output_mem->count(), ref_data.second.size());
+            mem_lock<ov::float16> mem_ptr(scores_output_mem, get_test_stream());
+            for (size_t i = 0; i < scores_output_mem->count(); i++) {
+                ASSERT_NEAR(mem_ptr[i], ref_data.second[i], tolerance);
+            }
+        }
+    }
+};
+
+struct paged_attention_test_params {
+    std::vector<SubsequenceDescriptor> subsequences;
+    int num_heads;
+    int head_size;
+    int block_size;
+    bool scores_output;
+};
+
+class paged_attention_test : public PagedAttentionTest<paged_attention_test_params> {};
+TEST_P(paged_attention_test, basic) {
+    auto p = GetParam();
+
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke_paged_attention, paged_attention_test, ::testing::ValuesIn(std::vector<paged_attention_test_params>{
+    /* with scores output */
+    paged_attention_test_params{ {{10, 0}}, 2, 64, 16, true },                    // 1st token
+    paged_attention_test_params{ {{36, 0}}, 2, 64, 16, true },                    // 1st token
+    paged_attention_test_params{ {{1024, 0}}, 2, 64, 16, true },                  // 1st token long
+    paged_attention_test_params{ {{10, 0}, {30, 0}}, 2, 64, 16, true },           // 1st token + 1st token
+    paged_attention_test_params{ {{128, 0}, {256, 0}}, 2, 64, 16, true },         // 1st token + 1st token
+    paged_attention_test_params{ {{1, 10}}, 2, 64, 16, true },                    // 2nd token
+    paged_attention_test_params{ {{1, 34}, {1, 515}}, 2, 64, 16, true },          // 2nd token + 2nd token
+    paged_attention_test_params{ {{1, 34}, {25, 0}, {10, 34}}, 2, 64, 16, true }, // mixed: 2nd token + 1st token + part of 1st token
+    /* without scores output */
+    paged_attention_test_params{ {{10, 0}}, 2, 64, 16, false },                   // 1st token
+    paged_attention_test_params{ {{1024, 0}}, 2, 64, 16, false },                 // 1st token long
+    paged_attention_test_params{ {{1, 34}, {1, 515}}, 2, 64, 16, false },         // 2nd token + 2nd token
+}));
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
index 8ade3b6c8e0f31..0f9f119f275a78 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
@@ -2467,6 +2467,99 @@ TEST(reorder_gpu_f32, bfzyx_to_bsv16_fsv16_padded)
     }
 }
 
+TEST(reorder_gpu_f32, bfzyx_to_bfyx_padded) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+
+    const int32_t b_in = 1024;
+    const int32_t f_in = 64;
+    const int32_t x_in = 72;
+    const int32_t y_in = 2;
+    const int32_t z_in = 3;
+
+    const int32_t b_crop = 1024;
+    const int32_t f_crop = 64;
+    const int32_t x_crop = 72;
+    const int32_t y_crop = 2;
+    const int32_t z_crop = 1;
+
+    const int32_t z0_off = 0;
+    const int32_t z1_off = 1;
+    const int32_t z2_off = 2;
+
+    auto input = engine.allocate_memory({ data_types::f32,format::bfzyx,{ b_in, f_in, x_in, y_in, z_in } });
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(crop("crop0", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop },
{ 0, 0, 0, 0, z0_off }));
+    topology.add(crop("crop1", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z1_off }));
+    topology.add(crop("crop2", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z2_off }));
+    topology.add(reorder("reorder0", input_info("crop0"), format::bfyx, data_types::f32));
+    topology.add(reorder("reorder1", input_info("crop1"), format::bfyx, data_types::f32));
+    topology.add(reorder("reorder2", input_info("crop2"), format::bfyx, data_types::f32));
+    topology.add(reshape("reshape0", input_info("reorder0"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+    topology.add(reshape("reshape1", input_info("reorder1"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+    topology.add(reshape("reshape2", input_info("reorder2"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+
+    std::vector<float> input_vec = rg.generate_random_1d<float>(input->count(), -10, 10);
+    set_values(input, input_vec);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+    auto output0 = outputs.at("reshape0").get_memory();
+    auto output1 = outputs.at("reshape1").get_memory();
+    auto output2 = outputs.at("reshape2").get_memory();
+
+    cldnn::mem_lock<float> output_ptr0(output0, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z0_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr0[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+
+    cldnn::mem_lock<float> output_ptr1(output1, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z1_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr1[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+
+    cldnn::mem_lock<float> output_ptr2(output2, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z2_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr2[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+}
+
 TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed)
 {
     auto& engine = get_test_engine();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 260a1c444284cb..eb13bc8b5bd1d9 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -5,14 +5,208 @@
 #include "llm_infer_request.hpp"
 #include "logging.hpp"
+#include "openvino/op/ops.hpp"
+#include "openvino/openvino.hpp"
+#include "openvino/opsets/opset13.hpp"
+#include "openvino/pass/graph_rewrite.hpp"
+#include "openvino/pass/matcher_pass.hpp"
"openvino/pass/matcher_pass.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/pass/stateful_to_stateless.hpp" +#include "openvino/pass/validate.hpp" #include "openvino/runtime/iasync_infer_request.hpp" +namespace opp = ov::pass::pattern; +class TransposeValueTensors : public ov::pass::MatcherPass { +public: + struct Context { + std::vector> new_params; + std::vector> old_params; + using Ref = std::reference_wrapper; + }; + + OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::TransposeValueTensors"); + TransposeValueTensors(Context::Ref ctx) { + auto param = opp::wrap_type(); + auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto concat = opp::wrap_type({param, transpose}); + auto softmax = opp::wrap_type({opp::any_input()}); + auto matmul = opp::wrap_type({softmax, concat}); + + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); + auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); + auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); + auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); + + auto matched_param = std::static_pointer_cast(matched_node_param); + auto matched_concat = std::static_pointer_cast(matched_node_concat); + auto matched_transpose = std::static_pointer_cast(matched_node_transpose); + auto matched_matmul = std::static_pointer_cast(matched_node_matmul); + + auto shape = matched_param->get_partial_shape(); + OPENVINO_ASSERT(shape.size() == 4u); + // NB: Transpose Parameter that correspond to V-tensor it will + // speed-up its multiplication with attention scores + std::swap(shape[2], shape[3]); + auto new_param = std::make_shared(matched_param->get_element_type(), shape); + new_param->set_friendly_name(matched_param->get_friendly_name()); + new_param->outputs().begin()->get_tensor().set_names( + matched_param->outputs().begin()->get_tensor().get_names()); + ov::replace_node(matched_param, new_param); + // NB: Save in order to add/remove to the model later on + ctx.get().new_params.push_back(new_param); + ctx.get().old_params.push_back(matched_param); + + auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); + auto new_transpose = + std::make_shared(matched_transpose->input_value(0), order_cst->output(0)); + new_transpose->set_friendly_name(matched_transpose->get_friendly_name()); + ov::replace_node(matched_transpose, new_transpose); + + auto new_concat = + std::make_shared(ov::OutputVector{new_param->output(0), new_transpose->output(0)}, + 3u); + new_concat->set_friendly_name(matched_concat->get_friendly_name()); + ov::replace_node(matched_concat, new_concat); + + matched_matmul->set_transpose_b(true); + + return true; + }; + register_matcher(std::make_shared(matmul, "TransposeValueTensors"), std::move(callback)); + } +}; + +class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::ScaledDotProductAttentionDecomposition"); + ScaledDotProductAttentionDecomposition() { + auto pattern_node = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto node = ov::as_type_ptr( + pattern_to_output.at(pattern_node).get_node_shared_ptr()); + + if (node == nullptr || 
+
+class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::ScaledDotProductAttentionDecomposition");
+    ScaledDotProductAttentionDecomposition() {
+        auto pattern_node = ov::pass::pattern::wrap_type<ov::op::v13::ScaledDotProductAttention>();
+
+        ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
+            auto& pattern_to_output = m.get_pattern_value_map();
+            auto node = ov::as_type_ptr<ov::op::v13::ScaledDotProductAttention>(
+                pattern_to_output.at(pattern_node).get_node_shared_ptr());
+
+            if (node == nullptr || transformation_callback(node)) {
+                return false;
+            }
+
+            auto new_output_node = decompose(node);
+            ov::replace_node(node, new_output_node);
+            return true;
+        };
+
+        auto m = std::make_shared<ov::pass::pattern::Matcher>(pattern_node, "ScaledDotProductAttentionDecomposition");
+        register_matcher(m, std::move(callback));
+    }
+    std::shared_ptr<ov::Node> decompose(std::shared_ptr<ov::op::v13::ScaledDotProductAttention> node) {
+        using namespace ov::op;
+        using namespace ov;
+        auto query = node->input_value(0);
+        auto key = node->input_value(1);
+        auto value = node->input_value(2);
+        auto q_shape = register_new_node<v3::ShapeOf>(query, element::i32);
+        auto k_shape = register_new_node<v3::ShapeOf>(key, element::i32);
+        auto minus_one = register_new_node(v0::Constant::create(element::i32, Shape{}, {-1}));
+        auto minus_two = register_new_node(v0::Constant::create(element::i32, Shape{}, {-2}));
+        auto zero_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {0}));
+        auto one_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {1}));
+        auto one_f = register_new_node<v1::ConvertLike>(one_i, query);
+        auto zero_f = register_new_node<v1::ConvertLike>(zero_i, query);
+
+        Output<Node> scale;
+        if (node->get_input_size() < 5) {
+            scale = register_new_node<v8::Gather>(q_shape, minus_one, zero_i)->output(0);
+            scale = register_new_node<v1::ConvertLike>(scale, query);
+            auto sqrt_scale = register_new_node<v0::Sqrt>(scale);
+            scale = register_new_node<v1::Divide>(one_f, sqrt_scale);
+        } else {
+            scale = node->input_value(4);
+        }
+
+        auto q_scaled = register_new_node<v1::Multiply>(query, scale);
+        auto k_rank = register_new_node<v3::ShapeOf>(k_shape, element::i32)->output(0);
+        auto k_last_dim = register_new_node<v1::Add>(k_rank, minus_one);
+        auto k_next_dim = register_new_node<v1::Add>(k_rank, minus_two)->output(0);
+        k_rank = register_new_node<v0::Squeeze>(k_rank, zero_i);
+        auto minus_inf =
+            register_new_node(v0::Constant::create(element::f32, Shape{}, {-std::numeric_limits<float>::infinity()}))
+                ->output(0);
+        auto keep_dim_last = register_new_node<v0::Unsqueeze>(k_next_dim, zero_i);
+        auto k_dims_before_transpose = register_new_node<v4::Range>(zero_i, keep_dim_last, one_i, element::i32);
+
+        auto scaled_atten = register_new_node<v0::MatMul>(q_scaled, key, false, true)->output(0);
+        minus_inf = register_new_node<v1::ConvertLike>(minus_inf, scaled_atten);
+
+        if (node->get_causal() || node->get_input_size() > 3) {
+            Output<Node> mask;
+            Output<Node> atten_mask;
+            if (!node->get_causal()) {
+                mask = node->input_value(3);
+
+                // Two types of masks are supported: a boolean mask, where a value of True
+                // indicates that the element should take part in attention, and a float mask
+                // of the same type as query/key/value, which is added to the attention score.
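+                // E.g. a boolean mask [[true, false]] admits key 0 and blocks key 1,
+                // while the equivalent additive float mask would be [[0.0, -inf]].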
+                if (mask.get_element_type() == element::boolean) {
+                    atten_mask = register_new_node<v1::ConvertLike>(mask, scaled_atten);
+                    auto inv_mask = register_new_node<v1::LogicalNot>(mask);
+                    atten_mask = register_new_node<v1::Select>(inv_mask, atten_mask, minus_inf);
+                } else {
+                    atten_mask = mask;
+                }
+            } else {
+                auto target_s_len = register_new_node<v8::Gather>(q_shape, minus_two, zero_i);
+                auto source_s_len = register_new_node<v8::Gather>(k_shape, minus_two, zero_i);
+                auto ssl = register_new_node<v0::Unsqueeze>(source_s_len, zero_i);
+                auto tsl = register_new_node<v0::Unsqueeze>(target_s_len, zero_i);
+                auto mask_shape = register_new_node<v0::Concat>(OutputVector{tsl, ssl}, 0);
+                mask = register_new_node<v3::Broadcast>(minus_inf, mask_shape);
+                auto horizontal_range =
+                    register_new_node<v4::Range>(zero_i, source_s_len, one_i, element::i32)->output(0);
+                horizontal_range = register_new_node<v0::Unsqueeze>(horizontal_range, zero_i);
+                auto stop = register_new_node<v1::Add>(target_s_len, one_i);
+                auto vertical_range = register_new_node<v4::Range>(one_i, stop, one_i, element::i32)->output(0);
+                vertical_range = register_new_node<v0::Unsqueeze>(vertical_range, one_i);
+                auto triu = register_new_node<v1::GreaterEqual>(horizontal_range, vertical_range);
+                atten_mask = register_new_node<v1::Select>(triu, mask, zero_f);
+            }
+            scaled_atten = register_new_node<v1::Add>(scaled_atten, atten_mask);
+        }
+
+        scaled_atten = register_new_node<v8::Softmax>(scaled_atten, -1);
+        auto result = register_new_node<v0::MatMul>(scaled_atten, value);
+        result->set_friendly_name(node->get_friendly_name());
+        copy_runtime_info(node, get_new_nodes());
+        return result;
+    }
+};
+
 namespace {
 uint32_t align_to(uint32_t value, uint32_t alignment) {
     return (value + alignment - 1) & ~(alignment - 1);
 }
 
+std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
+    ov::preprocess::PrePostProcessor ppp(model);
+
+    for (const auto& tensor : model->inputs()) {
+        if (tensor.get_any_name().find("past_key") != std::string::npos) {
+            ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        }
+    }
+
+    for (const auto& tensor : model->outputs()) {
+        if (tensor.get_any_name().find("present") != std::string::npos) {
+            ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        }
+    }
+
+    return ppp.build();
+}
+
 std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) {
     const auto kStartOutputKVCacheLayers = 1u;
     for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) {
@@ -27,22 +221,33 @@ std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr
 
-std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
+std::shared_ptr<ov::Model> cvt_value_tensors_layout(std::shared_ptr<ov::Model> model) {
     ov::preprocess::PrePostProcessor ppp(model);
-
-    for (const auto& tensor : model->inputs()) {
-        if (tensor.get_any_name().find("past_key") != std::string::npos) {
-            ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+    for (auto tensor : model->outputs()) {
+        if (tensor.get_any_name().find("value") != std::string::npos) {
+            // NB: [batch, num_heads, seq_len, emb_size] -> [batch, num_heads, emb_size, seq_len]
+            ppp.output(tensor.get_any_name()).model().set_layout(ov::Layout("BHSE"));
+            ppp.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("BHES"));
         }
     }
+    return ppp.build();
+}
 
-    for (const auto& tensor : model->outputs()) {
-        if (tensor.get_any_name().find("present") != std::string::npos) {
-            ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
-        }
+bool optimize_value_tensors(std::shared_ptr<ov::Model> model) {
+    ov::pass::GraphRewrite rewr;
+    rewr.add_matcher<ScaledDotProductAttentionDecomposition>();
+    TransposeValueTensors::Context ctx;
+    rewr.add_matcher<TransposeValueTensors>(std::ref(ctx));
+    rewr.run_on_model(model);
+
+    model->add_parameters(ctx.new_params);
+    for (auto old_param : ctx.old_params) {
+        model->remove_parameter(old_param);
     }
+    ov::pass::Validate().run_on_model(model);
 
-    return ppp.build();
+    // NB: if new_params is not empty, the pass has been applied
+    return !ctx.new_params.empty();
 }
 
 struct KVAxesPosition {
@@ -116,32 +321,6 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr
     return std::make_optional(NPUDesc{arch.as<std::string>(), max_tiles.as<int64_t>()});
 }
 
-std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
-    if (auto it = config.find(option_name); it != config.end()) {
-        std::optional<ov::Any> found = std::make_optional(it->second);
-        config.erase(it);
-        return found;
-    }
-    return std::nullopt;
-}
-
-template <typename T>
-std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name) {
-    if (auto it = config.find(option_name); it != config.end()) {
-        return std::make_optional(it->second.as<T>());
-    }
-    return std::nullopt;
-}
-
-template <typename T>
-T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
-    auto anyopt = pop_option(config, key);
-    if (anyopt.has_value()) {
-        return anyopt.value().as<T>();
-    }
-    return default_value;
-}
-
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
         {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -206,12 +385,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
     }
 }
 
-void drop_cache_dir(ov::AnyMap& config) {
-    if (config.count("NPU_USE_NPUW") != 0u) {
-        pop_option(config, "CACHE_DIR");
-    }
-}
-
 void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
     for (auto it = properties.begin(); it != properties.end(); ++it) {
         if (it->first.find("NPUW_LLM") != it->first.npos) {
@@ -251,41 +424,48 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     auto kvcache_model = model->clone();
     LOG_DEBUG("2. Transform kvcache model from stateful to stateless.");
     ov::pass::StatefulToStateless().run_on_model(kvcache_model);
-
     LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one.");
     auto prefill_model = kvcache_model->clone();
     prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");
-    LOG_DEBUG("4. Converting KV-cache in prefill model to FP16.");
-    prefill_model = cvt_kvcache_to_fp16(prefill_model);
-
-    LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token.");
-    kvcache_model = redirect_new_kv_to_output(kvcache_model);
-    LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16.");
-    kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
+    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
     const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
     const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
-    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
     KVAxesPosition axes = get_kv_axes(model_desc.type);
     m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
-    LOG_DEBUG("7. Make prefill model with static shapes");
+    LOG_DEBUG("4. Make prefill model with static shapes");
     reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
-    LOG_DEBUG("8. Make kvcache model with static shapes");
+    LOG_DEBUG("5. Make kvcache model with static shapes");
     reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
+    LOG_DEBUG("6. Check and apply opt layout if applicable.");
+    // NB: Try to apply the opt transpose only for the Llama-2-7b-chat-hf model
+    if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
+        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
+        if (optimize_value_tensors(kvcache_model)) {
+            // NB: Check if the TransposeValueTensors transformation was applied
+            m_kvcache_desc.v_tensors_transposed = true;
+            prefill_model = cvt_value_tensors_layout(prefill_model);
+        }
+    }
+    LOG_DEBUG("7. Optimize kvcache model to output key/values for new token.");
+    kvcache_model = redirect_new_kv_to_output(kvcache_model);
+    LOG_DEBUG("8. Converting KV-cache in kvcache model to FP16.");
+    kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
+    LOG_DEBUG("9. Converting KV-cache in prefill model to FP16.");
+    prefill_model = cvt_kvcache_to_fp16(prefill_model);
     auto npudesc = extract_npu_descriptor(plugin);
-
-    ov::AnyMap properties_copy = std::move(other_props);
+    ov::AnyMap properties_copy = other_props;
     auto prefill_config = get_default_prefill_config(model, npudesc);
+
     // NB: GENERATE_HINT is only applicable for default generate config!
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
-    LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
+    LOG_DEBUG(
+        "10. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
     auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
+
     merge_config_with(prefill_config, properties_copy);
     merge_config_with(generate_config, properties_copy);
-    // FIXME: Drop CACHE_DIR option if NPUW is enabled
-    drop_cache_dir(prefill_config);
-    drop_cache_dir(generate_config);
 
     m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
     m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
index 1a748997fd48fa..e37a47b2c77948 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
@@ -22,6 +22,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
         uint32_t total_size = 0u;
         uint32_t num_stored_tokens = 0u;
         uint32_t dim = 0u;
+        bool v_tensors_transposed = false;
     };
 
     LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index a73478c0cab5d2..12f103cc0ab6a2 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -27,6 +27,36 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
     end_shape[dim] = end_pos;
     return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape));
 }
+
+void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst) {
+    const auto src_shape = src->get_shape();
+
+    OPENVINO_ASSERT(src_shape.size() == 4u);
+    OPENVINO_ASSERT(src_shape == dst->get_shape());
+    OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size());
+
+    const auto src_strides = src->get_strides();
+    const auto dst_strides = dst->get_strides();
+    const auto elem_size = src->get_byte_size() / src->get_size();
+
+    const auto C = src_shape[1];
+    const auto H = src_shape[2];
+    const auto W = src_shape[3];
+
+    const auto IS_H = src_strides[2];
+    const auto OS_H = dst_strides[2];
+
+    const size_t chunk_byte_size = W * elem_size;
+
+    const auto* src_p = static_cast<const uint8_t*>(src->data());
+    auto* dst_p = static_cast<uint8_t*>(dst->data());
+
+    for (size_t i = 0; i < C * H; ++i) {
+        const size_t src_offset = i * IS_H;
+        const size_t dst_offset = i * OS_H;
+        std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset);
+    }
+}
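+
+// NB (illustration): for a transposed V-cache the destination slice is not
+// contiguous, so the copy walks all C * H rows and moves one W-element chunk per
+// row using each tensor's own stride; e.g. for shape [1, 8, 64, 1024] this issues
+// 8 * 64 = 512 chunk copies of 1024 elements each.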
 }  // anonymous namespace
 
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
@@ -116,17 +146,25 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         // taking into account kvcache dimension.
         fill_tensor<ov::float16>(kvcache_in_tensor, 0);
 
+        const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
+                                 ? 3u
+                                 : m_kvcache_desc.dim;
+
         auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
-                                                   m_kvcache_desc.dim,
+                                                   kv_dim,
                                                    m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
                                                    m_kvcache_desc.max_prompt_size);
 
-        auto kvcache_in_slice =
-            make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
+        auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens);
 
-        prefill_out_slice->copy_to(kvcache_in_slice._ptr);
+        if (kv_dim == 3u) {
+            copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
+        } else {
+            prefill_out_slice->copy_to(kvcache_in_slice._ptr);
+        }
     }
 
+    LOG_DEBUG("Prepare attention mask pattern.");
     auto* attention_mask_data = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
@@ -156,8 +194,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
+        const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
+                                 ? 3u
+                                 : m_kvcache_desc.dim;
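+        // NB (illustration): when TransposeValueTensors put the value cache into
+        // BHES layout, its sequence axis is 3, so "value" tensors are sliced and
+        // appended along dim 3 instead of m_kvcache_desc.dim.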
         auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
-                                                  m_kvcache_desc.dim,
+                                                  kv_dim,
                                                   m_kvcache_desc.num_stored_tokens - 1,
                                                   m_kvcache_desc.num_stored_tokens);
         auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index 5abe4b39fd44f2..0260fc9718c444 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -160,7 +160,8 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
         auto qcoeff_shape = matched_node_qcoeff->output(0).get_shape();
 
         if ((ov::element::i4 == matched_qweight->get_element_type() ||
-             ov::element::i8 == matched_qweight->get_element_type()) &&
+             ov::element::i8 == matched_qweight->get_element_type() ||
+             ov::element::nf4 == matched_qweight->get_element_type()) &&
            (ov::op::util::is_parameter(matched_node_qcoeff) || ov::op::util::is_constant(matched_node_qcoeff)) &&
             qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
             auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 08b4308479ef03..de3ad80280d603 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,5 +7,5 @@ add_subdirectory(model_hub_tests)
 add_subdirectory(samples_tests)
 add_subdirectory(e2e_tests)
 
-install(FILES requirements_pytorch requirements_tensorflow requirements_onnx
+install(FILES requirements_pytorch requirements_tensorflow requirements_onnx requirements_jax
         DESTINATION tests COMPONENT tests EXCLUDE_FROM_ALL)
diff --git a/tests/constraints.txt b/tests/constraints.txt
index 4f46cd0cc8b2e9..c339ac3c65d56f 100644
--- a/tests/constraints.txt
+++ b/tests/constraints.txt
@@ -21,11 +21,8 @@ pytest>=5.0,<8.4
 pytest-dependency==0.5.1
 pytest-html==4.1.1
 pytest-timeout==2.3.1
-jax<=0.4.36
-jaxlib<=0.4.36
 kornia==0.7.0
 networkx<=3.3
-flax<=0.10.2
 
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch~=2.5.1; platform_system != "Darwin" or platform_machine != "x86_64"
diff --git a/tests/e2e_tests/requirements.txt b/tests/e2e_tests/requirements.txt
index 29e1c1cf31c558..a2056071e5417e 100644
--- a/tests/e2e_tests/requirements.txt
+++ b/tests/e2e_tests/requirements.txt
@@ -9,7 +9,7 @@ scipy>=1.5.4,<1.15
 opencv-python>=4.5; sys_platform != "darwin"
 opencv-python==4.8.1.78; sys_platform == "darwin"
 unittest-xml-reporting==3.0.4
-lpips==0.1.3
+lpips==0.1.4
 
 # for utils/e2e/comparator note: python 3.6 wheels is not available since 0.18
 # Add upper-bound due CVS-105039, CVS-105040
diff --git a/tests/layer_tests/onnx_tests/test_abs.py b/tests/layer_tests/onnx_tests/test_abs.py
index 9a82929ea35547..71e509faef3e65 100644
--- a/tests/layer_tests/onnx_tests/test_abs.py
+++ b/tests/layer_tests/onnx_tests/test_abs.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136")
+
 from common.layer_test_class import check_ir_version
 from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model
 
diff --git a/tests/layer_tests/onnx_tests/test_and.py b/tests/layer_tests/onnx_tests/test_and.py
index ca5d21a42fe067..195ace1dadfa14 100644
--- a/tests/layer_tests/onnx_tests/test_and.py
+++ b/tests/layer_tests/onnx_tests/test_and.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pytest
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_argmax.py b/tests/layer_tests/onnx_tests/test_argmax.py index 604df5e7e69875..80d7568e9e8c4c 100644 --- a/tests/layer_tests/onnx_tests/test_argmax.py +++ b/tests/layer_tests/onnx_tests/test_argmax.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_ceil.py b/tests/layer_tests/onnx_tests/test_ceil.py index b7558630ac1c63..ea7ea10abbd31d 100644 --- a/tests/layer_tests/onnx_tests/test_ceil.py +++ b/tests/layer_tests/onnx_tests/test_ceil.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_clip.py b/tests/layer_tests/onnx_tests/test_clip.py index dbce45193034d9..3cb3ba250a12e0 100644 --- a/tests/layer_tests/onnx_tests/test_clip.py +++ b/tests/layer_tests/onnx_tests/test_clip.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_concat.py b/tests/layer_tests/onnx_tests/test_concat.py index 8627f3b198dbd3..602b6a69644527 100644 --- a/tests/layer_tests/onnx_tests/test_concat.py +++ b/tests/layer_tests/onnx_tests/test_concat.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_conv.py b/tests/layer_tests/onnx_tests/test_conv.py index b7f9729141c33e..202d6af2915c67 100644 --- a/tests/layer_tests/onnx_tests/test_conv.py +++ b/tests/layer_tests/onnx_tests/test_conv.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_cumsum.py b/tests/layer_tests/onnx_tests/test_cumsum.py index 1e197de490d518..486b1f50835fb0 100644 --- a/tests/layer_tests/onnx_tests/test_cumsum.py +++ b/tests/layer_tests/onnx_tests/test_cumsum.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_dequantize_linear.py b/tests/layer_tests/onnx_tests/test_dequantize_linear.py index 9090f3a829919b..319030590a3f0d 100644 --- a/tests/layer_tests/onnx_tests/test_dequantize_linear.py +++ b/tests/layer_tests/onnx_tests/test_dequantize_linear.py @@ -3,6 +3,8 @@ import numpy as np 
import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_elu.py b/tests/layer_tests/onnx_tests/test_elu.py index dbffc32d09c6c7..9f0321ec9a6ee3 100644 --- a/tests/layer_tests/onnx_tests/test_elu.py +++ b/tests/layer_tests/onnx_tests/test_elu.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_embedding_bag.py b/tests/layer_tests/onnx_tests/test_embedding_bag.py index a18a59b9752f16..54d940c01fb36c 100644 --- a/tests/layer_tests/onnx_tests/test_embedding_bag.py +++ b/tests/layer_tests/onnx_tests/test_embedding_bag.py @@ -5,6 +5,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + import torch import torch.nn as nn from common.layer_test_class import CommonLayerTest, check_ir_version diff --git a/tests/layer_tests/onnx_tests/test_floor.py b/tests/layer_tests/onnx_tests/test_floor.py index 87ad058c510e8c..5076befc414941 100644 --- a/tests/layer_tests/onnx_tests/test_floor.py +++ b/tests/layer_tests/onnx_tests/test_floor.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_gather.py b/tests/layer_tests/onnx_tests/test_gather.py index a45d5b4f4a916b..9380de31c6dccc 100644 --- a/tests/layer_tests/onnx_tests/test_gather.py +++ b/tests/layer_tests/onnx_tests/test_gather.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_hard_sigmoid.py b/tests/layer_tests/onnx_tests/test_hard_sigmoid.py index 12986c590d41d4..a62ab2a7fc54e8 100644 --- a/tests/layer_tests/onnx_tests/test_hard_sigmoid.py +++ b/tests/layer_tests/onnx_tests/test_hard_sigmoid.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_identity.py b/tests/layer_tests/onnx_tests/test_identity.py index a86c0e2a687257..e58e272de49ec0 100644 --- a/tests/layer_tests/onnx_tests/test_identity.py +++ b/tests/layer_tests/onnx_tests/test_identity.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_leaky_relu.py b/tests/layer_tests/onnx_tests/test_leaky_relu.py index 3a12bfcd92c33e..cff9cd87b59d30 100644 --- a/tests/layer_tests/onnx_tests/test_leaky_relu.py +++ b/tests/layer_tests/onnx_tests/test_leaky_relu.py @@ -2,6 +2,8 @@ # 
SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_log.py b/tests/layer_tests/onnx_tests/test_log.py index db0a329aa09746..53e2c42505bf7b 100644 --- a/tests/layer_tests/onnx_tests/test_log.py +++ b/tests/layer_tests/onnx_tests/test_log.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_logsoftmax.py b/tests/layer_tests/onnx_tests/test_logsoftmax.py index a81b20402d50dd..057376d6ed48b2 100644 --- a/tests/layer_tests/onnx_tests/test_logsoftmax.py +++ b/tests/layer_tests/onnx_tests/test_logsoftmax.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_lrn.py b/tests/layer_tests/onnx_tests/test_lrn.py index 0e8f34129a300f..1c1cf62d5d12b4 100644 --- a/tests/layer_tests/onnx_tests/test_lrn.py +++ b/tests/layer_tests/onnx_tests/test_lrn.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_neg.py b/tests/layer_tests/onnx_tests/test_neg.py index d19991cb8a6b12..98f6acd728f637 100644 --- a/tests/layer_tests/onnx_tests/test_neg.py +++ b/tests/layer_tests/onnx_tests/test_neg.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_non_zero.py b/tests/layer_tests/onnx_tests/test_non_zero.py index 464304651a2a19..a2035b4ab27d63 100644 --- a/tests/layer_tests/onnx_tests/test_non_zero.py +++ b/tests/layer_tests/onnx_tests/test_non_zero.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_not.py b/tests/layer_tests/onnx_tests/test_not.py index 05a6c7ffbb2e2d..1caf8e2e7a770c 100644 --- a/tests/layer_tests/onnx_tests/test_not.py +++ b/tests/layer_tests/onnx_tests/test_not.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_or.py b/tests/layer_tests/onnx_tests/test_or.py index 285c90765d6a7e..6db35aff2f500e 100644 --- a/tests/layer_tests/onnx_tests/test_or.py +++ b/tests/layer_tests/onnx_tests/test_or.py @@ -3,6 +3,8 @@ import numpy as np import pytest 
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_pad.py b/tests/layer_tests/onnx_tests/test_pad.py index abacc530d93144..161db0685b6fa8 100644 --- a/tests/layer_tests/onnx_tests/test_pad.py +++ b/tests/layer_tests/onnx_tests/test_pad.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_pooling.py b/tests/layer_tests/onnx_tests/test_pooling.py index 85e7fc883fc5d8..2bc2251f8aea49 100644 --- a/tests/layer_tests/onnx_tests/test_pooling.py +++ b/tests/layer_tests/onnx_tests/test_pooling.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_prelu.py b/tests/layer_tests/onnx_tests/test_prelu.py index f20e89b7006a44..59a1e8f4f415e1 100644 --- a/tests/layer_tests/onnx_tests/test_prelu.py +++ b/tests/layer_tests/onnx_tests/test_prelu.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_reduce.py b/tests/layer_tests/onnx_tests/test_reduce.py index 58141e18260016..46b4008c4e653d 100644 --- a/tests/layer_tests/onnx_tests/test_reduce.py +++ b/tests/layer_tests/onnx_tests/test_reduce.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_reduce_lp.py b/tests/layer_tests/onnx_tests/test_reduce_lp.py index 2ff4511ef87443..3cf2f5e133b895 100644 --- a/tests/layer_tests/onnx_tests/test_reduce_lp.py +++ b/tests/layer_tests/onnx_tests/test_reduce_lp.py @@ -5,6 +5,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_relu.py b/tests/layer_tests/onnx_tests/test_relu.py index ce597920923289..520749ed948b25 100644 --- a/tests/layer_tests/onnx_tests/test_relu.py +++ b/tests/layer_tests/onnx_tests/test_relu.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_reshape.py b/tests/layer_tests/onnx_tests/test_reshape.py index 637beeb4388bbb..28eb339af52f9e 100644 --- a/tests/layer_tests/onnx_tests/test_reshape.py +++ b/tests/layer_tests/onnx_tests/test_reshape.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest 
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_resize.py b/tests/layer_tests/onnx_tests/test_resize.py index 4d28afdb50fe38..36a808fa859ef1 100644 --- a/tests/layer_tests/onnx_tests/test_resize.py +++ b/tests/layer_tests/onnx_tests/test_resize.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_roi_align.py b/tests/layer_tests/onnx_tests/test_roi_align.py index 4cd49c50c20bf8..d5cedf4e1a0f06 100644 --- a/tests/layer_tests/onnx_tests/test_roi_align.py +++ b/tests/layer_tests/onnx_tests/test_roi_align.py @@ -5,6 +5,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model from unit_tests.utils.graph import build_graph diff --git a/tests/layer_tests/onnx_tests/test_scatter.py b/tests/layer_tests/onnx_tests/test_scatter.py index 578300e144bc3d..baaa0392553fbf 100644 --- a/tests/layer_tests/onnx_tests/test_scatter.py +++ b/tests/layer_tests/onnx_tests/test_scatter.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_sigmoid.py b/tests/layer_tests/onnx_tests/test_sigmoid.py index 5dcb3e8f1b112a..db055a6d9030ac 100644 --- a/tests/layer_tests/onnx_tests/test_sigmoid.py +++ b/tests/layer_tests/onnx_tests/test_sigmoid.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_sign.py b/tests/layer_tests/onnx_tests/test_sign.py index 07f4f169a7bc1b..70c0ffcc0033ec 100644 --- a/tests/layer_tests/onnx_tests/test_sign.py +++ b/tests/layer_tests/onnx_tests/test_sign.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_softmax.py b/tests/layer_tests/onnx_tests/test_softmax.py index c4d9d600276402..390b1a894549c3 100644 --- a/tests/layer_tests/onnx_tests/test_softmax.py +++ b/tests/layer_tests/onnx_tests/test_softmax.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_softplus.py b/tests/layer_tests/onnx_tests/test_softplus.py index cdcbbbf3e8ed13..b0127c0dcf0624 100644 --- a/tests/layer_tests/onnx_tests/test_softplus.py +++ b/tests/layer_tests/onnx_tests/test_softplus.py @@ -2,6 +2,8 @@ # 
SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_softsign.py b/tests/layer_tests/onnx_tests/test_softsign.py index 30ca27402c7878..75043b57b80dc7 100644 --- a/tests/layer_tests/onnx_tests/test_softsign.py +++ b/tests/layer_tests/onnx_tests/test_softsign.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_sqrt.py b/tests/layer_tests/onnx_tests/test_sqrt.py index 9c4733a68cd9fa..24dbbcac659df4 100644 --- a/tests/layer_tests/onnx_tests/test_sqrt.py +++ b/tests/layer_tests/onnx_tests/test_sqrt.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_trigonometry.py b/tests/layer_tests/onnx_tests/test_trigonometry.py index 563b63b1e5632d..99651091ea2e96 100644 --- a/tests/layer_tests/onnx_tests/test_trigonometry.py +++ b/tests/layer_tests/onnx_tests/test_trigonometry.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_where.py b/tests/layer_tests/onnx_tests/test_where.py index fb358a2ced8415..1bf845340b3922 100644 --- a/tests/layer_tests/onnx_tests/test_where.py +++ b/tests/layer_tests/onnx_tests/test_where.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_xor.py b/tests/layer_tests/onnx_tests/test_xor.py index 2790a31784ff59..e7f0c11f8362a2 100644 --- a/tests/layer_tests/onnx_tests/test_xor.py +++ b/tests/layer_tests/onnx_tests/test_xor.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/requirements.txt b/tests/layer_tests/requirements.txt index 04889ebce10a39..2ba12cc5e2bece 100644 --- a/tests/layer_tests/requirements.txt +++ b/tests/layer_tests/requirements.txt @@ -16,5 +16,3 @@ pytest defusedxml tensorflow tensorflow-addons; python_version <= '3.10' -jax; sys_platform == "linux" and platform_machine == "x86_64" # https://jax.readthedocs.io/en/latest/installation.html#pip-installation-cpu - wheels are for "x86_64" only -jaxlib; sys_platform == "linux" and platform_machine == "x86_64" # https://jax.readthedocs.io/en/latest/installation.html#pip-installation-cpu - wheels are for "x86_64" only diff --git a/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py 
b/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py index 4ff4d589cbae32..5c1037e38cfc84 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py @@ -67,4 +67,4 @@ def test_unary_ops(self, input_shape, input_type, op_type, pytest.skip("159585: accuracy error on ARM") self._test(*self.create_unary_net(input_shape, input_type, op_type), ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend, custom_eps=1e-3) + use_legacy_frontend=use_legacy_frontend, custom_eps=3 * 1e-3) diff --git a/tests/model_hub_tests/jax/requirements.txt b/tests/model_hub_tests/jax/requirements.txt deleted file mode 100644 index 328084ac050ca6..00000000000000 --- a/tests/model_hub_tests/jax/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ --c ../../constraints.txt -numpy -pytest -pytest-html -transformers -requests -jax -jaxlib -flax -pillow \ No newline at end of file diff --git a/tests/requirements_jax b/tests/requirements_jax new file mode 100644 index 00000000000000..c392df4359bee3 --- /dev/null +++ b/tests/requirements_jax @@ -0,0 +1,13 @@ +numpy==1.26.4; python_version < "3.12" or platform_system == "Darwin" and platform_machine == "x86_64" +numpy==2.2.1; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") +pytest==7.0.1 +pytest-xdist[psutil]==3.6.1 +pytest-html==4.1.1 +jax==0.4.38; (platform_system != "Darwin" or platform_machine != "x86_64") and python_version > "3.9" +# tensorflow 2.16.2 depends on ml-dtypes~=0.3.1 and jax 0.4.35 depends on ml-dtypes>=0.4.0 +jax==0.4.33; (platform_system == "Darwin" and platform_machine == "x86_64") and python_version > "3.9" +jax==0.4.30; python_version <= "3.9" +flax==0.10.2 +transformers==4.47.1 +defusedxml +pillow diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index f42deb81839883..33907145f7de4b 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -14,7 +14,8 @@ torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 transformers==4.44.2 -pytest==7.0.1 +pytest==7.0.1; python_version < '3.10' +pytest==7.2.0; python_version >= '3.10' pytest-html==4.1.1 pytest-xdist[psutil]==3.6.1 defusedxml==0.7.1 diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 5369b0135f7618..5d699facad1c91 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -4,7 +4,8 @@ # tensorflow 2.16.2 depends on numpy<2.0.0 and >=1.26.0; python_version >= "3.12" numpy==1.26.4; python_version < "3.12" or platform_system == "Darwin" and platform_machine == "x86_64" numpy==2.0.2; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") -pytest==7.0.1 +pytest==7.0.1; python_version < '3.10' +pytest==7.2.0; python_version >= '3.10' pytest-xdist[psutil]==3.6.1 pytest-html==4.1.1 transformers==4.45.1 @@ -17,7 +18,7 @@ wrapt==1.15.0; python_version >= "3.12" # tensorflow-text is not available for both Windows and ARM platforms tensorflow-text==2.18.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1 -jax==0.4.35; (platform_system != "Darwin" or platform_machine != "x86_64") and python_version > "3.9" +jax==0.4.38; (platform_system != "Darwin" or platform_machine != "x86_64") and python_version > "3.9" # tensorflow 2.16.2 depends on 
ml-dtypes~=0.3.1 and jax 0.4.35 depends on ml-dtypes>=0.4.0 jax==0.4.33; (platform_system == "Darwin" and platform_machine == "x86_64") and python_version > "3.9" jax==0.4.30; python_version <= "3.9" diff --git a/tools/benchmark_tool/openvino/__init__.py b/tools/benchmark_tool/openvino/__init__.py index 7643f742e0067d..69c678909b1c9e 100644 --- a/tools/benchmark_tool/openvino/__init__.py +++ b/tools/benchmark_tool/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass @@ -17,47 +17,6 @@ # # This __init__.py forces checking of runtime modules to propagate errors. # # It is not compared with init files from openvino-dev package. # # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue - # Import all public modules from openvino import runtime as runtime from openvino import frontend as frontend @@ -67,10 +26,36 @@ from openvino import utils as utils from openvino import properties as properties +# Import most important classes and functions from openvino.runtime +from openvino._ov_api import Model +from openvino._ov_api import Core +from openvino._ov_api import CompiledModel +from openvino._ov_api import InferRequest +from openvino._ov_api import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + # Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file +from openvino.runtime.utils.data_helpers import tensor_from_file from openvino._ov_api import compile_model +from 
openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import save_model +from openvino.runtime import layout_helpers +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor +from openvino._pyopenvino import Op # Import opsets from openvino import opset1 @@ -95,7 +80,7 @@ from openvino._pyopenvino import VASurfaceTensor # Set version for openvino package -from openvino._pyopenvino import get_version +from openvino.runtime import get_version __version__ = get_version() # Tools diff --git a/tools/mo/openvino/__init__.py b/tools/mo/openvino/__init__.py index 7643f742e0067d..b015570964c520 100644 --- a/tools/mo/openvino/__init__.py +++ b/tools/mo/openvino/__init__.py @@ -7,96 +7,61 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass -# # -# # OpenVINO API -# # This __init__.py forces checking of runtime modules to propagate errors. -# # It is not compared with init files from openvino-dev package. -# # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue +# OpenVINO API +try: + # Import all public modules + from openvino import runtime as runtime + from openvino import frontend as frontend + from openvino import helpers as helpers + from openvino import preprocess as preprocess + from openvino import utils as utils + from openvino import properties as properties -# Import all public modules -from openvino import runtime as runtime -from openvino import frontend as frontend -from openvino import helpers as helpers -from openvino import experimental as experimental -from openvino import preprocess as preprocess -from openvino import utils as utils -from openvino import properties as 
properties + # Import most important classes and functions from openvino.runtime + from openvino.runtime import Model + from openvino.runtime import Core + from openvino.runtime import CompiledModel + from openvino.runtime import InferRequest + from openvino.runtime import AsyncInferQueue -# Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file -from openvino._ov_api import compile_model + from openvino.runtime import Symbol + from openvino.runtime import Dimension + from openvino.runtime import Strides + from openvino.runtime import PartialShape + from openvino.runtime import Shape + from openvino.runtime import Layout + from openvino.runtime import Type + from openvino.runtime import Tensor + from openvino.runtime import OVAny + from openvino.runtime import compile_model + from openvino.runtime import get_batch + from openvino.runtime import set_batch + from openvino.runtime import serialize + from openvino.runtime import shutdown + from openvino.runtime import tensor_from_file + from openvino.runtime import save_model + from openvino.runtime import layout_helpers -# Import opsets -from openvino import opset1 -from openvino import opset2 -from openvino import opset3 -from openvino import opset4 -from openvino import opset5 -from openvino import opset6 -from openvino import opset7 -from openvino import opset8 -from openvino import opset9 -from openvino import opset10 -from openvino import opset11 -from openvino import opset12 -from openvino import opset13 -from openvino import opset14 -from openvino import opset15 -from openvino import opset16 + from openvino._pyopenvino import RemoteContext + from openvino._pyopenvino import RemoteTensor + from openvino._pyopenvino import Op -# libva related: -from openvino._pyopenvino import VAContext -from openvino._pyopenvino import VASurfaceTensor + # libva related: + from openvino._pyopenvino import VAContext + from openvino._pyopenvino import VASurfaceTensor -# Set version for openvino package -from openvino._pyopenvino import get_version -__version__ = get_version() + # Set version for openvino package + from openvino.runtime import get_version + __version__ = get_version() +except ImportError: + import warnings + warnings.warn("openvino package has problems with imports!", ImportWarning, stacklevel=2) # Tools try: diff --git a/tools/openvino_dev/src/openvino/__init__.py b/tools/openvino_dev/src/openvino/__init__.py index 7643f742e0067d..b015570964c520 100644 --- a/tools/openvino_dev/src/openvino/__init__.py +++ b/tools/openvino_dev/src/openvino/__init__.py @@ -7,96 +7,61 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass -# # -# # OpenVINO API -# # This __init__.py forces checking of runtime modules to propagate errors. -# # It is not compared with init files from openvino-dev package. 
-# # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue +# OpenVINO API +try: + # Import all public modules + from openvino import runtime as runtime + from openvino import frontend as frontend + from openvino import helpers as helpers + from openvino import preprocess as preprocess + from openvino import utils as utils + from openvino import properties as properties -# Import all public modules -from openvino import runtime as runtime -from openvino import frontend as frontend -from openvino import helpers as helpers -from openvino import experimental as experimental -from openvino import preprocess as preprocess -from openvino import utils as utils -from openvino import properties as properties + # Import most important classes and functions from openvino.runtime + from openvino.runtime import Model + from openvino.runtime import Core + from openvino.runtime import CompiledModel + from openvino.runtime import InferRequest + from openvino.runtime import AsyncInferQueue -# Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file -from openvino._ov_api import compile_model + from openvino.runtime import Symbol + from openvino.runtime import Dimension + from openvino.runtime import Strides + from openvino.runtime import PartialShape + from openvino.runtime import Shape + from openvino.runtime import Layout + from openvino.runtime import Type + from openvino.runtime import Tensor + from openvino.runtime import OVAny + from openvino.runtime import compile_model + from openvino.runtime import get_batch + from openvino.runtime import set_batch + from openvino.runtime import serialize + from openvino.runtime import shutdown + from openvino.runtime import tensor_from_file + from openvino.runtime import save_model + from openvino.runtime import layout_helpers -# Import opsets -from openvino import opset1 -from openvino import opset2 -from openvino import opset3 -from openvino import opset4 -from openvino import opset5 
-from openvino import opset6 -from openvino import opset7 -from openvino import opset8 -from openvino import opset9 -from openvino import opset10 -from openvino import opset11 -from openvino import opset12 -from openvino import opset13 -from openvino import opset14 -from openvino import opset15 -from openvino import opset16 + from openvino._pyopenvino import RemoteContext + from openvino._pyopenvino import RemoteTensor + from openvino._pyopenvino import Op -# libva related: -from openvino._pyopenvino import VAContext -from openvino._pyopenvino import VASurfaceTensor + # libva related: + from openvino._pyopenvino import VAContext + from openvino._pyopenvino import VASurfaceTensor -# Set version for openvino package -from openvino._pyopenvino import get_version -__version__ = get_version() + # Set version for openvino package + from openvino.runtime import get_version + __version__ = get_version() +except ImportError: + import warnings + warnings.warn("openvino package has problems with imports!", ImportWarning, stacklevel=2) # Tools try: diff --git a/tools/ovc/openvino/__init__.py b/tools/ovc/openvino/__init__.py index 7643f742e0067d..69c678909b1c9e 100644 --- a/tools/ovc/openvino/__init__.py +++ b/tools/ovc/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass @@ -17,47 +17,6 @@ # # This __init__.py forces checking of runtime modules to propagate errors. # # It is not compared with init files from openvino-dev package. # # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue - # Import all public modules from openvino import runtime as runtime from openvino import frontend as frontend @@ -67,10 +26,36 @@ from openvino import utils as utils from openvino import properties as properties +# Import most important 
classes and functions from openvino.runtime +from openvino._ov_api import Model +from openvino._ov_api import Core +from openvino._ov_api import CompiledModel +from openvino._ov_api import InferRequest +from openvino._ov_api import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + # Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file +from openvino.runtime.utils.data_helpers import tensor_from_file from openvino._ov_api import compile_model +from openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import save_model +from openvino.runtime import layout_helpers +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor +from openvino._pyopenvino import Op # Import opsets from openvino import opset1 @@ -95,7 +80,7 @@ from openvino._pyopenvino import VASurfaceTensor # Set version for openvino package -from openvino._pyopenvino import get_version +from openvino.runtime import get_version __version__ = get_version() # Tools
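

The pytest.importorskip guard added across the ONNX layer tests above skips an entire test module at collection time when the legacy Model Optimizer (openvino.tools.mo) is not importable, rather than failing inside the common.* imports that depend on it. Below is a minimal sketch of the pattern as used in the patch; the test body is illustrative only and not part of the diff.

import pytest

# Skip every test in this module when the legacy MO package is absent;
# the call must run before any import that transitively needs it.
mo = pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136")

def test_mo_importable():
    # Only collected and executed when the guard above succeeded;
    # importorskip returns the imported module itself.
    assert mo is not None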
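
The new tests/requirements_jax file, and the pytest version split in requirements_pytorch and requirements_tensorflow, select pins per platform and interpreter with PEP 508 environment markers, so one file serves Linux, Windows, and both macOS architectures. The sketch below shows how pip evaluates such a marker; it uses the packaging library for illustration only (an assumption for demonstration, not something the patch itself runs).

from packaging.markers import Marker

# One of the jax markers from tests/requirements_jax, verbatim.
marker = Marker(
    '(platform_system != "Darwin" or platform_machine != "x86_64") '
    'and python_version > "3.9"'
)

# evaluate() consults the running interpreter by default ...
print(marker.evaluate())

# ... or an explicit environment can be supplied to test other targets.
# macOS x86_64 on Python 3.9 fails this marker and falls through to
# the jax==0.4.30 pin.
print(marker.evaluate({
    "platform_system": "Darwin",
    "platform_machine": "x86_64",
    "python_version": "3.9",
}))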
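
The openvino/__init__.py rewrites above converge on one pattern: the public API is re-exported through openvino.runtime instead of being pulled name-by-name from the openvino._pyopenvino bindings, and in the mo and openvino_dev shims the whole import block is additionally wrapped so that a missing or broken runtime degrades to a warning rather than an ImportError at interpreter start. A condensed sketch of that fallback, with the import list trimmed to a few representative names:

try:
    # Re-export the public API via openvino.runtime rather than
    # importing the pybind bindings (_pyopenvino) directly.
    from openvino.runtime import Core, Model, Tensor, get_version
    __version__ = get_version()
except ImportError:
    # Keep the shim importable even when the runtime wheel is absent,
    # mirroring the fallback the patch adds to the tool packages.
    import warnings
    warnings.warn("openvino package has problems with imports!",
                  ImportWarning, stacklevel=2)

Presumably this lets the legacy tool entry points load far enough to emit a meaningful warning instead of crashing on import when the binary runtime is not installed.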