diff --git a/.github/scripts/workflow_rerun/errors_to_look_for.json b/.github/scripts/workflow_rerun/errors_to_look_for.json index b9cac8f17adaa6..d8fe6ac2df03d2 100644 --- a/.github/scripts/workflow_rerun/errors_to_look_for.json +++ b/.github/scripts/workflow_rerun/errors_to_look_for.json @@ -86,5 +86,25 @@ { "error_text": "because the GET request got Content-Type", "ticket": 158400 + }, + { + "error_text": "Unable to make request:", + "ticket": 158401 + }, + { + "error_text": "Failed to make request", + "ticket": 158401 + }, + { + "error_text": "Failure when receiving data from the peer", + "ticket": 159323 + }, + { + "error_text": "HTTP response code said error", + "ticket": 159398 + }, + { + "error_text": "download failed after attempts", + "ticket": 159547 } ] \ No newline at end of file diff --git a/.github/workflows/cleanup_caches.yml b/.github/workflows/cleanup_caches.yml index d6633fd9dab3ee..c3aac30ccd4379 100644 --- a/.github/workflows/cleanup_caches.yml +++ b/.github/workflows/cleanup_caches.yml @@ -4,7 +4,7 @@ on: schedule: # at 00:00 on the 1st day of every month - cron: '0 0 1 * *' - + permissions: read-all jobs: @@ -61,8 +61,8 @@ jobs: cache-path: ${{ env.CCACHE_PATH }} recursive: true key: '.' - - + + Cleanup_ccache_win: name: Cleanup Windows ccache runs-on: 'aks-win-4-cores-8gb' diff --git a/.github/workflows/export_workflow_metrics.yml b/.github/workflows/export_workflow_metrics.yml index 39bb699b8caa91..aef00244f8175b 100644 --- a/.github/workflows/export_workflow_metrics.yml +++ b/.github/workflows/export_workflow_metrics.yml @@ -34,7 +34,7 @@ permissions: read-all jobs: export-workflow-metrics: name: Export finished workflow metrics - runs-on: aks-linux-2-cores-8gb + runs-on: aks-linux-2-cores-8gb-stats if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: diff --git a/.github/workflows/job_jax_layer_tests.yml b/.github/workflows/job_jax_layer_tests.yml new file mode 100644 index 00000000000000..25f171060f43be --- /dev/null +++ b/.github/workflows/job_jax_layer_tests.yml @@ -0,0 +1,133 @@ +name: JAX Layer Tests + +on: + workflow_call: + inputs: + runner: + description: 'Machine on which the tests would run' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + affected-components: + description: 'Components that are affected by changes in the commit defined by the Smart CI Action' + type: string + required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true + +permissions: read-all + +env: + PIP_CACHE_PATH_LINUX: /mount/caches/pip/linux + PIP_CACHE_PATH_WIN: "C:\\mount\\caches\\pip\\win" + +jobs: + JAX_Layer_Tests: + name: JAX Layer Tests + timeout-minutes: 40 + runs-on: ${{ inputs.runner }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels + LAYER_TESTS_INSTALL_DIR: ${{ github.workspace }}/install/tests/layer_tests + steps: + - name: Download OpenVINO artifacts (tarballs) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_[tests]* + path: ${{ env.INSTALL_DIR }} + merge-multiple: true + + - name: Download OpenVINO artifacts (wheels) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_[wheels]* + path: ${{ env.INSTALL_WHEELS_DIR }} + merge-multiple: true + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + if: runner.os != 'Windows' + run: | + echo "OPENVINO_REPO=$GITHUB_WORKSPACE/openvino" >> "$GITHUB_ENV" + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" + echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages (Linux, macOS) + if: runner.os != 'Windows' + run: | + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} + working-directory: ${{ env.INSTALL_DIR }} + + - name: Extract OpenVINO artifacts (Windows) + if: runner.os == 'Windows' + run: | + Expand-Archive openvino_tests.zip -DestinationPath ${{ env.INSTALL_DIR }} + working-directory: ${{ env.INSTALL_DIR }} + + - name: Fetch setup_python and install wheels actions + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 + with: + sparse-checkout: | + .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml + sparse-checkout-cone-mode: false + path: 'openvino' + + - name: Setup Python ${{ inputs.python-version }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ inputs.python-version }} + pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH_LINUX || env.PIP_CACHE_PATH_WIN }} + should-setup-pip-paths: ${{ runner.os != 'macOS' }} + self-hosted-runner: ${{ runner.os != 'macOS' }} + + - name: Install OpenVINO Python wheels + uses: ./openvino/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + + - name: Install JAX Layer tests dependencies + run: | + # jax test requirements + python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/requirements_jax + + - name: JAX Layer Tests + if: ${{ fromJSON(inputs.affected-components).JAX_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/jax_tests ${PARALLEL} -m 
precommit_jax_fe --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-jax.xml + env: + TEST_DEVICE: CPU + TEST_PRECISION: FP16 + JAX_TRACE_MODE: JAXPR + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} + + - name: Upload Test Results + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + if: ${{ !cancelled() }} + with: + name: test-results-python-jax-layers + path: | + ${{ env.INSTALL_TEST_DIR }}/TEST*.html + ${{ env.INSTALL_TEST_DIR }}/TEST*.xml + if-no-files-found: 'warn' diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index 07155db1016057..57eb07a83aa423 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -89,7 +89,7 @@ jobs: - name: Install JAX tests requirements for precommit run: | - python3 -m pip install -r ${MODEL_HUB_TESTS_INSTALL_DIR}/jax/requirements.txt + python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/requirements_jax - name: JAX/Flax Models Tests from Hugging Face if: ${{ inputs.model_scope == 'precommit' || inputs.model_scope == 'nightly' }} diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index b04f719c8e296f..e1532d530ff2db 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -162,14 +162,6 @@ jobs: export LD_LIBRARY_PATH=${PIP_INSTALL_PATH}/openvino/libs:$LD_LIBRARY_PATH python3 -m pytest ${LAYER_TESTS_INSTALL_DIR}/py_frontend_tests --junitxml=${INSTALL_TEST_DIR}/TEST-test_py_fontend.xml - - name: JAX Layer Tests - JAX FE - if: ${{ fromJSON(inputs.affected-components).JAX_FE.test && runner.arch != 'ARM64' && runner.os != 'macOS' }} - run: python3 -m pytest ${LAYER_TESTS_INSTALL_DIR}/jax_tests/ -m precommit_jax_fe --junitxml=${INSTALL_TEST_DIR}/TEST-jax_fe.xml - env: - TEST_DEVICE: CPU - TEST_PRECISION: FP16 - JAX_TRACE_MODE: JAXPR - - name: TensorFlow Lite Layer Tests - TFL FE if: fromJSON(inputs.affected-components).TFL_FE.test run: python3 -m pytest ${LAYER_TESTS_INSTALL_DIR}/tensorflow_lite_tests/ -n logical --junitxml=${INSTALL_TEST_DIR}/TEST-tfl_fe.xml diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 66e825e5d5e126..ca1ca6e056e23d 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -202,6 +202,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Docker, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-linux-16-cores-32gb-arm' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 5e4335b8151c02..0fbc20cf19594b 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -356,6 +356,15 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'macos-13' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + 
python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests # if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 855d76973cc2e4..b60daefa442c83 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -355,6 +355,15 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'macos-13-xlarge' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 5aed74bbb242b8..e5c7d25003de1e 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -334,6 +334,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CPU_Functional_Tests: name: CPU functional tests if: fromJSON(needs.smart_ci.outputs.affected_components).CPU.test diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 25be095e692d35..beac15bfbda97d 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -156,6 +156,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index f1fd0be596baa2..de33f2603d7430 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -499,6 +499,15 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + JAX_Layer_Tests: + name: JAX Layer Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_jax_layer_tests.yml + with: + runner: 'aks-win-8-cores-16gb' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + CXX_Unit_Tests: name: C++ unit tests needs: [ Build, Smart_CI ] diff --git a/.github/workflows/workflow_rerunner.yml b/.github/workflows/workflow_rerunner.yml index 0d8d6610bea588..535101ec943264 100644 --- a/.github/workflows/workflow_rerunner.yml +++ b/.github/workflows/workflow_rerunner.yml @@ -29,7 +29,7 @@ jobs: name: Rerun Workflow # Run only for the failed workflows in openvinotoolkit org if: ${{ 
github.event.workflow_run.conclusion == 'failure' && github.repository_owner == 'openvinotoolkit' }} - runs-on: aks-linux-2-cores-8gb + runs-on: aks-linux-2-cores-8gb-stats permissions: actions: write contents: read @@ -70,7 +70,7 @@ jobs: rerunner_tests: name: Rerunner Tests if: ${{ github.event_name == 'pull_request' && github.repository_owner == 'openvinotoolkit' }} - runs-on: aks-linux-2-cores-8gb + runs-on: aks-linux-2-cores-8gb-stats steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -96,9 +96,9 @@ jobs: working-directory: ${{ github.workspace }}/.github/scripts/workflow_rerun run: | export PYTHONPATH=${{ github.workspace }}/.github/scripts/workflow_rerun:${{ github.workspace }}/.github/scripts:$PYTHONPATH - + # Need to get a run id with successful status for log analyzing # cannot lock a run id as logs get deleted after some time run_id=$(python3 -c "from github import Github, Auth; import os; github=Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN'))); repo = github.get_repo('${GITHUB_REPOSITORY}'); run_id = repo.get_workflow_runs(status='success')[0].id; print(run_id)") - + python3 rerunner.py --repository-name ${GITHUB_REPOSITORY} --run-id $run_id --dry-run diff --git a/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst b/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst index d877cb1768d44d..f4ec275491fa32 100644 --- a/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst +++ b/docs/articles_en/about-openvino/compatibility-and-support/supported-models.rst @@ -6,16 +6,14 @@ models from OpenVINO-supported frameworks may also work properly but have not be **AI Models that run on Intel® Core Ultra™ Processors with OpenVINO™ toolkit:** -.. raw:: html - - - - -.. csv-table:: +.. data-table:: :class: modeldata stripe :name: supportedModelsTable :header-rows: 1 :file: ../../_static/download/supported_models.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 | Marked cells indicate models that passed inference with no errors. Empty cells indicate diff --git a/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst b/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst index d27f7626391f46..1bd8f5dae7c634 100644 --- a/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst +++ b/docs/articles_en/about-openvino/compatibility-and-support/supported-operations.rst @@ -41,27 +41,36 @@ Data as of OpenVINO 2024.4, 18 Oct. 2024. .. tab-item:: PyTorch - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: TensorFlow ops + :name: TensorFlow_ops_v1 :header-rows: 1 :file: ../../_static/conformance_files/pytorch_ops.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 .. tab-item:: TensorFlow - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: TensorFlow ops + :name: TensorFlow_ops_v2 :header-rows: 1 :file: ../../_static/conformance_files/tensorflow_ops.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 .. tab-item:: PaddlePaddle - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: Paddle ops + :name: Paddle_ops :header-rows: 1 :file: ../../_static/conformance_files/paddlepaddle_ops.csv + :data-column-hidden: [] + :data-order: [[ 0, "asc" ]] + :data-page-length: 10 .. 
tab-item:: ONNX
diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst
index 085a1ff8449151..83581d465df92e 100644
--- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst
+++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst
@@ -8,10 +8,6 @@ The current data is as of OpenVINO 2024.4, 20 Nov. 2024.
 
 The tables below list the key performance indicators for inference on built-in GPUs.
 
-.. raw:: html
-
-
-
 
 .. tab-set::
 
@@ -22,7 +18,9 @@ The tables below list the key performance indicators for inference on built-in G
          :name: supportedModelsTable_V1
          :header-rows: 1
          :file: ../../_static/benchmarks_files/llm_models_9-288V.csv
-         :hidden: [3,4,6]
+         :data-column-hidden: [3,4,6]
+         :data-order: [[ 0, "asc" ]]
+         :data-page-length: 10
 
    .. tab-item:: 7-268V
 
      .. csv-table::
         :class: modeldata stripe
         :name: supportedModelsTable_V2
         :header-rows: 1
         :file: ../../_static/benchmarks_files/llm_models_7-258V.csv
-        :hidden: [3,4,6]
+        :data-column-hidden: [3,4,6]
+        :data-order: [[ 0, "asc" ]]
 
   .. tab-item:: 7-155H
 
     .. csv-table::
        :class: modeldata stripe
        :name: supportedModelsTable_V3
       :header-rows: 1
        :file: ../../_static/benchmarks_files/llm_models_7-155H.csv
-       :hidden: [3,4,6]
+       :data-column-hidden: [3,4,6]
+       :data-order: [[ 0, "asc" ]]
 
 
 .. grid:: 1 1 2 2
diff --git a/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst b/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst
index 6ac806daf0cda0..62cfdf05f2b11f 100644
--- a/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst
+++ b/docs/articles_en/openvino-workflow/model-preparation/convert-model-pytorch.rst
@@ -203,6 +203,52 @@ Here is an example of how to convert a model obtained with ``torch.export``:
    This is an experimental feature. Use it only if you know that you need to.
    PyTorch version 2.2 is recommended.
    Dynamic shapes are not supported yet.
 
+Converting a PyTorch Model from Disk
+####################################
+
+PyTorch can save models in two distinct formats: ``torch.jit.ScriptModule`` and ``torch.export.ExportedProgram``.
+Both formats can be saved to disk as standalone files, enabling them to be reloaded independently of the original Python code.
+
+ExportedProgram Format
+++++++++++++++++++++++
+
+The ``ExportedProgram`` format is saved to disk using `torch.export.save() <https://pytorch.org/docs/stable/export.html#torch.export.save>`__.
+Below is an example of how to convert an ``ExportedProgram`` from disk:
+
+.. tab-set::
+
+   .. tab-item:: Python
+      :sync: py
+
+      .. code-block:: py
+         :force:
+
+         import openvino as ov
+         ov_model = ov.convert_model('exported_program.pt2')
+
+   .. tab-item:: CLI
+      :sync: cli
+
+      .. code-block:: sh
+
+         ovc exported_program.pt2
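+
+For reference, the ``exported_program.pt2`` file used above (and the ``script_module.pt``
+file used in the next section) could be produced as in the following sketch. The module
+and input shape are illustrative placeholders, and a recent PyTorch release with
+``torch.export`` support is assumed:
+
+.. code-block:: py
+   :force:
+
+   import torch
+
+   class Sample(torch.nn.Module):
+       def forward(self, x):
+           return x * 2
+
+   # ExportedProgram: capture the module with torch.export, then serialize it.
+   exported_program = torch.export.export(Sample(), (torch.rand(1, 10),))
+   torch.export.save(exported_program, 'exported_program.pt2')
+
+   # ScriptModule: script the module, then serialize it with torch.jit.save.
+   torch.jit.save(torch.jit.script(Sample()), 'script_module.pt')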
+
+ScriptModule Format
++++++++++++++++++++
+
+`torch.jit.save() <https://pytorch.org/docs/stable/generated/torch.jit.save.html>`__ serializes a ``ScriptModule`` object to disk.
+To convert a serialized ``ScriptModule``, run the ``convert_model`` function with the ``example_input`` parameter, as follows:
+
+.. code-block:: py
+   :force:
+
+   from openvino import convert_model
+   import torch
+
+   convert_model(input_model='script_module.pt', example_input=torch.rand(1, 10))
+
+``example_input`` is a required parameter for this conversion, because a ``torch.jit.ScriptModule`` object is always saved to disk in an untraced state.
+
 Exporting a PyTorch Model to ONNX Format
 ########################################
diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py b/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py
index c3e0e81eec3b3a..814517289ce114 100644
--- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py
+++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py
@@ -11,7 +11,7 @@
 import requests
 import re
 import json
-
+import html
 import csv
 
 logger = logging.getLogger(__name__)
@@ -147,7 +147,9 @@ class DataTable(Directive):
         'file': directives.path,
         'class': directives.unchanged,
         'name': directives.unchanged,
-        'hidden': directives.unchanged
+        'data-column-hidden': directives.unchanged,
+        'data-page-length': directives.unchanged,
+        'data-order': directives.unchanged
     }
 
     def run(self) -> List[Node]:
@@ -159,10 +161,12 @@ def run(self) -> List[Node]:
         csv_node = []
         with open(csv_file, 'r') as j:
             csv_data = list(csv.reader(j))
-            class_table_tag = ' class="' + "".join(c for c in str(self.options['class']) + '"') if 'class' in self.options is not None else ""
-            id_table_tag = ' id="' + "".join(c for c in str(self.options['name']) + '"') if 'name' in self.options is not None else ""
-            hidden_table_tag = ' data-columns-hidden="' + "".join(c for c in str(self.options['hidden']) + '"') if 'hidden' in self.options is not None else ""
-            csv_table_html = '<table' + class_table_tag + id_table_tag + hidden_table_tag + '>'
+            class_table_tag = f' class="{html.escape(self.options["class"])}"' if "class" in self.options else ""
+            id_table_tag = f' id="{html.escape(self.options["name"])}"' if "name" in self.options else ""
+            data_column_hidden_tag = f' data-column-hidden="{html.escape(self.options["data-column-hidden"])}"' if "data-column-hidden" in self.options else ""
+            data_order_tag = f' data-order="{html.escape(self.options["data-order"])}"' if "data-order" in self.options else ""
+            data_page_length_tag = f' data-page-length="{html.escape(self.options["data-page-length"])}"' if "data-page-length" in self.options else ""
+            csv_table_html = f'<table{class_table_tag}{id_table_tag}{data_column_hidden_tag}{data_order_tag}{data_page_length_tag}>'
         head_rows = 0
         head_rows += self.options.get('header-rows', 0)
         row_count = 0
diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css
index de8a05732a4d06..1679f7309da044 100644
--- a/docs/sphinx_setup/_static/css/custom.css
+++ b/docs/sphinx_setup/_static/css/custom.css
@@ -69,7 +69,7 @@ a#wap_dns {
 /* Sphinx-design tabs override */
 .sd-tab-set>input:checked+label {
   color: var(--sd-color-black) !important;
-  background-color: #f8f8f8 !important;
+  background-color: white !important;
   border: solid 1px #bdbdbd;
   border-bottom: solid 0px;
   margin-bottom: -1px;
@@ -96,7 +96,7 @@ a#wap_dns {
   cursor: pointer;
   font-size: var(--sd-fontsize-tabs-label);
   font-weight: 400 !important;
-  padding: 5px 16px 2px !important;
+  padding: 5px 16px 0px !important;
   transition: color 250ms;
   width: auto;
   z-index: 1;
@@ -110,7 +110,6 @@ a#wap_dns {
   box-shadow: 0 0 0 0;
   border: solid 1px var(--sd-color-tabs-overline);
   border-color: #bdbdbd;
-  background-color: #f8f8f8;
   padding-right: 4px;
   padding-left: 4px;
   padding-bottom: 6px;
diff --git a/docs/sphinx_setup/_static/css/openVinoDataTables.css b/docs/sphinx_setup/_static/css/openVinoDataTables.css
index 526aabb6abe15d..bedc0f5206e260 100644 --- a/docs/sphinx_setup/_static/css/openVinoDataTables.css +++ b/docs/sphinx_setup/_static/css/openVinoDataTables.css @@ -6,8 +6,7 @@ div.dt-buttons>.dt-button, div.dt-buttons>div.dt-button-split .dt-button { } div.dt-container .dt-paging .dt-paging-button:hover { - color: white !important; - border: 1px solid #aaa; + border: 1px solid #aaa !important; background:none !important; background-color: var(--bttn-act-bg-hover) !important } @@ -190,10 +189,9 @@ div.dt-container .dt-paging .dt-paging-button { div.dt-container .dt-paging .dt-paging-button.current, div.dt-container .dt-paging .dt-paging-button.current:hover { background: none !important; - background-color: var(--bttn-act-bg-active) !important; + background-color: var(--bttn-sec-border-color) !important; border-color: var(--bttn-act-bg-active) !important; border-radius: 0px !important; - color: white !important; border: 1px !important } table.dataTable thead>tr>th.dt-orderable-asc span.dt-column-order:before, table.dataTable thead>tr>th.dt-orderable-asc span.dt-column-order:after, table.dataTable thead>tr>th.dt-orderable-desc span.dt-column-order:before, table.dataTable thead>tr>th.dt-orderable-desc span.dt-column-order:after, table.dataTable thead>tr>th.dt-ordering-asc span.dt-column-order:before, table.dataTable thead>tr>th.dt-ordering-asc span.dt-column-order:after, table.dataTable thead>tr>th.dt-ordering-desc span.dt-column-order:before, table.dataTable thead>tr>th.dt-ordering-desc span.dt-column-order:after, table.dataTable thead>tr>td.dt-orderable-asc span.dt-column-order:before, table.dataTable thead>tr>td.dt-orderable-asc span.dt-column-order:after, table.dataTable thead>tr>td.dt-orderable-desc span.dt-column-order:before, table.dataTable thead>tr>td.dt-orderable-desc span.dt-column-order:after, table.dataTable thead>tr>td.dt-ordering-asc span.dt-column-order:before, table.dataTable thead>tr>td.dt-ordering-asc span.dt-column-order:after, table.dataTable thead>tr>td.dt-ordering-desc span.dt-column-order:before, table.dataTable thead>tr>td.dt-ordering-desc span.dt-column-order:after { diff --git a/docs/sphinx_setup/_static/js/openVinoDataTables.js b/docs/sphinx_setup/_static/js/openVinoDataTables.js index bd56a71533786c..fb3a57d959020c 100644 --- a/docs/sphinx_setup/_static/js/openVinoDataTables.js +++ b/docs/sphinx_setup/_static/js/openVinoDataTables.js @@ -1,16 +1,15 @@ $(document).ready(function () { var columnDefs = []; - var tables = $('table.modeldata'); for (let table of tables) { - var hidden = table.getAttribute('data-columns-hidden'); + var hidden = table.getAttribute('data-column-hidden'); columnDefs = [{ "visible": false, "targets": JSON.parse(hidden) }] $(table).DataTable({ responsive: true, "autoWidth": false, language: { buttons: { - colvisRestore: "Restore default" + colvisRestore: "Restore default selection" } }, lengthMenu: [ diff --git a/docs/sphinx_setup/_templates/layout.html b/docs/sphinx_setup/_templates/layout.html index 0d2331b2c83fe3..a791091e1f13a4 100644 --- a/docs/sphinx_setup/_templates/layout.html +++ b/docs/sphinx_setup/_templates/layout.html @@ -9,6 +9,7 @@ + diff --git a/src/bindings/python/src/openvino/__init__.py b/src/bindings/python/src/openvino/__init__.py index 7643f742e0067d..69c678909b1c9e 100644 --- a/src/bindings/python/src/openvino/__init__.py +++ b/src/bindings/python/src/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import 
_add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass @@ -17,47 +17,6 @@ # # This __init__.py forces checking of runtime modules to propagate errors. # # It is not compared with init files from openvino-dev package. # # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue - # Import all public modules from openvino import runtime as runtime from openvino import frontend as frontend @@ -67,10 +26,36 @@ from openvino import utils as utils from openvino import properties as properties +# Import most important classes and functions from openvino.runtime +from openvino._ov_api import Model +from openvino._ov_api import Core +from openvino._ov_api import CompiledModel +from openvino._ov_api import InferRequest +from openvino._ov_api import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + # Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file +from openvino.runtime.utils.data_helpers import tensor_from_file from openvino._ov_api import compile_model +from openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import save_model +from openvino.runtime import layout_helpers +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor +from openvino._pyopenvino import Op # Import opsets from openvino import opset1 @@ -95,7 +80,7 @@ from openvino._pyopenvino import VASurfaceTensor # Set version for openvino package -from openvino._pyopenvino import get_version 
+from openvino.runtime import get_version __version__ = get_version() # Tools diff --git a/src/bindings/python/src/openvino/_ov_api.py b/src/bindings/python/src/openvino/_ov_api.py index da31fab4c95d8e..53d0fa5316498b 100644 --- a/src/bindings/python/src/openvino/_ov_api.py +++ b/src/bindings/python/src/openvino/_ov_api.py @@ -5,7 +5,9 @@ from types import TracebackType from typing import Any, Iterable, Union, Optional, Dict, Type from pathlib import Path +import warnings +import numpy as np from openvino._pyopenvino import Model as ModelBase from openvino._pyopenvino import Core as CoreBase @@ -14,7 +16,7 @@ from openvino._pyopenvino import Tensor from openvino._pyopenvino import Node -from openvino.utils.data_helpers import ( +from openvino.runtime.utils.data_helpers import ( OVDict, _InferRequestWrapper, _data_dispatch, diff --git a/src/bindings/python/src/openvino/frontend/frontend.py b/src/bindings/python/src/openvino/frontend/frontend.py index 6a16d5a573b7d7..4d549d24b4ef7c 100644 --- a/src/bindings/python/src/openvino/frontend/frontend.py +++ b/src/bindings/python/src/openvino/frontend/frontend.py @@ -7,7 +7,7 @@ from openvino._pyopenvino import FrontEnd as FrontEndBase from openvino._pyopenvino import FrontEndManager as FrontEndManagerBase from openvino._pyopenvino import InputModel -from openvino import Model +from openvino.runtime import Model class FrontEnd(FrontEndBase): diff --git a/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py b/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py index 9072598f824939..914f6b2e2ee548 100644 --- a/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py +++ b/src/bindings/python/src/openvino/frontend/jax/jaxpr_decoder.py @@ -6,7 +6,7 @@ import jax.core from openvino.frontend.jax.py_jax_frontend import _FrontEndJaxDecoder as Decoder -from openvino import PartialShape, Type as OVType, OVAny +from openvino.runtime import PartialShape, Type as OVType, OVAny from openvino.frontend.jax.utils import jax_array_to_ov_const, get_ov_type_for_value, \ ivalue_to_constant, param_to_constants diff --git a/src/bindings/python/src/openvino/frontend/jax/utils.py b/src/bindings/python/src/openvino/frontend/jax/utils.py index 659677b11d5af8..4535265d6de082 100644 --- a/src/bindings/python/src/openvino/frontend/jax/utils.py +++ b/src/bindings/python/src/openvino/frontend/jax/utils.py @@ -8,7 +8,7 @@ import jax.numpy as jnp import numpy as np from openvino.frontend.jax.passes import filter_element, filter_ivalue, filter_param -from openvino import op, Type as OVType, Shape, OVAny +from openvino.runtime import op, Type as OVType, Shape, OVAny numpy_to_ov_type_map = { np.float32: OVType.f32, diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index 81a2764ee1188d..c448571f1ac17a 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -10,7 +10,7 @@ from openvino.frontend.pytorch.py_pytorch_frontend import _FrontEndPytorchDecoder as Decoder from openvino.frontend.pytorch.py_pytorch_frontend import _Type as DecoderType -from openvino import PartialShape, Type as OVType, OVAny, Shape +from openvino.runtime import PartialShape, Type as OVType, OVAny, Shape from openvino.frontend.pytorch.utils import make_constant, fetch_attr, pt_to_ov_type_map, torch_tensor_to_ov_const logger = logging.getLogger(__name__) diff --git 
a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index a9a65781dcb254..9f2ef019769875 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -18,7 +18,7 @@ from torch._decomp import decomposition_table, get_decompositions from openvino.frontend import FrontEndManager -from openvino import Core, Type, PartialShape +from openvino.runtime import Core, Type, PartialShape from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder from openvino.frontend.pytorch.torchdynamo import decompositions from openvino.frontend.pytorch.torchdynamo.decompositions import get_aot_decomposition_list, get_inf_decomposition_list @@ -27,7 +27,7 @@ from openvino.frontend.pytorch.torchdynamo.compile import cached_model_name, openvino_compile_cached_model from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_model_caching, _get_decompositions, _get_aot_autograd -from openvino import Core, Type, PartialShape +from openvino.runtime import Core, Type, PartialShape logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py index c9a772b3feac42..47b3b82806b18b 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend_utils.py @@ -5,7 +5,7 @@ # mypy: ignore-errors from typing import Optional, Any -from openvino import Core +from openvino.runtime import Core def _get_device(options) -> Optional[Any]: diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py index ca8d5478e76c15..fa446893a05d07 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py @@ -14,7 +14,7 @@ from openvino.frontend import FrontEndManager from openvino.frontend.pytorch.fx_decoder import TorchFXPythonDecoder -from openvino import Core, Type, PartialShape, serialize +from openvino.runtime import Core, Type, PartialShape, serialize from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_config, _is_cache_dir_in_config from typing import Callable, Optional diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py index 7527ad7acb37a4..4f41f7b5a6a9de 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py @@ -20,7 +20,7 @@ from openvino.frontend.pytorch.fx_decoder import TorchFXPythonDecoder from openvino.frontend.pytorch.torchdynamo.partition import Partitioner from openvino.frontend.pytorch.torchdynamo.compile import openvino_compile -from openvino import Core, Type, PartialShape +from openvino.runtime import Core, Type, PartialShape from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_aot_autograd from typing import Callable, Optional, Any diff --git 
a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py index 7bb8073167a654..6d8fdb1658793e 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py @@ -6,7 +6,7 @@ from openvino.frontend.pytorch.py_pytorch_frontend import _FrontEndPytorchDecoder as Decoder from openvino.frontend.pytorch.py_pytorch_frontend import _Type as DecoderType -from openvino import op, PartialShape, Type as OVType, OVAny +from openvino.runtime import op, PartialShape, Type as OVType, OVAny from openvino.frontend.pytorch.utils import ( ivalue_to_constant, get_value_from_getattr, @@ -15,7 +15,7 @@ convert_quantized_tensor, graph_has_ops, ) -from openvino import opset11 as ops +from openvino.runtime import opset11 as ops from openvino.frontend.pytorch import quantized, patch_model from openvino.frontend.pytorch.module_extension import ModuleExtension diff --git a/src/bindings/python/src/openvino/frontend/pytorch/utils.py b/src/bindings/python/src/openvino/frontend/pytorch/utils.py index 9ba36707037c9e..826d766505fa79 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/utils.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/utils.py @@ -7,8 +7,8 @@ import torch import numpy as np -from openvino import op, Type as OVType, Shape, Tensor -from openvino import opset11 as ops +from openvino.runtime import op, Type as OVType, Shape, Tensor +from openvino.runtime import opset11 as ops def make_constant(*args, **kwargs): diff --git a/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py b/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py index d15262cbc30366..fcedd7a74c2b51 100644 --- a/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py +++ b/src/bindings/python/src/openvino/frontend/tensorflow/node_decoder.py @@ -7,7 +7,7 @@ import numpy as np import tensorflow as tf from openvino.frontend.tensorflow.py_tensorflow_frontend import _FrontEndDecoderBase as DecoderBase -from openvino import PartialShape, Type, OVAny, Tensor +from openvino.runtime import PartialShape, Type, OVAny, Tensor def tf_type_to_ov_type(tf_type_int): diff --git a/src/bindings/python/src/openvino/frontend/tensorflow/utils.py b/src/bindings/python/src/openvino/frontend/tensorflow/utils.py index 7de5dc950be53e..74c0dfff92297e 100644 --- a/src/bindings/python/src/openvino/frontend/tensorflow/utils.py +++ b/src/bindings/python/src/openvino/frontend/tensorflow/utils.py @@ -8,7 +8,7 @@ import logging as log import numpy as np import sys -from openvino import PartialShape, Dimension, Type +from openvino.runtime import PartialShape, Dimension, Type from packaging.version import parse, Version from typing import List, Dict, Union diff --git a/src/bindings/python/src/openvino/helpers/packing.py b/src/bindings/python/src/openvino/helpers/packing.py index d0956e09fc6261..796af87402f3a6 100644 --- a/src/bindings/python/src/openvino/helpers/packing.py +++ b/src/bindings/python/src/openvino/helpers/packing.py @@ -5,7 +5,7 @@ import numpy as np from typing import Union -from openvino import Type, Shape +from openvino.runtime import Type, Shape def pack_data(array: np.ndarray, type: Type) -> np.ndarray: diff --git a/src/bindings/python/src/openvino/opset1/ops.py b/src/bindings/python/src/openvino/opset1/ops.py index e264aea304fb1f..edca6c62a0b246 100644 --- a/src/bindings/python/src/openvino/opset1/ops.py +++ 
b/src/bindings/python/src/openvino/opset1/ops.py @@ -8,17 +8,17 @@ import numpy as np from functools import partial -from openvino import Node, PartialShape, Type +from openvino.runtime import Node, PartialShape, Type from openvino.op import Constant, Parameter, tensor_iterator -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset10/ops.py b/src/bindings/python/src/openvino/opset10/ops.py index d0bc3cbf1cba4a..c7b75777484a59 100644 --- a/src/bindings/python/src/openvino/opset10/ops.py +++ b/src/bindings/python/src/openvino/opset10/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import List, Optional -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, as_node, diff --git a/src/bindings/python/src/openvino/opset11/ops.py b/src/bindings/python/src/openvino/opset11/ops.py index 95767b4800db1c..575c99501d2d6c 100644 --- a/src/bindings/python/src/openvino/opset11/ops.py +++ b/src/bindings/python/src/openvino/opset11/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import List, Optional -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, ) diff --git a/src/bindings/python/src/openvino/opset12/ops.py b/src/bindings/python/src/openvino/opset12/ops.py index 4b354b1fcff973..928bf4f71a9773 100644 --- a/src/bindings/python/src/openvino/opset12/ops.py +++ b/src/bindings/python/src/openvino/opset12/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import Optional -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, as_node, diff --git a/src/bindings/python/src/openvino/opset13/ops.py b/src/bindings/python/src/openvino/opset13/ops.py index 5c6863740120f8..12f0d06b1a28e6 100644 --- a/src/bindings/python/src/openvino/opset13/ops.py +++ b/src/bindings/python/src/openvino/opset13/ops.py @@ -11,12 +11,12 @@ log = logging.getLogger(__name__) -from openvino import Node, Shape, Type, Output, Tensor +from 
openvino.runtime import Node, Shape, Type, Output, Tensor from openvino.op import Constant, Result from openvino.opset1 import convert_like -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import binary_op, nameable_op, unary_op, overloading -from openvino.utils.types import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op, overloading +from openvino.runtime.utils.types import ( NumericData, NodeInput, NumericType, diff --git a/src/bindings/python/src/openvino/opset14/ops.py b/src/bindings/python/src/openvino/opset14/ops.py index 59e1bfd3e89c6f..fa872d24eb7f1a 100644 --- a/src/bindings/python/src/openvino/opset14/ops.py +++ b/src/bindings/python/src/openvino/opset14/ops.py @@ -7,11 +7,11 @@ from typing import Union, Optional, List -from openvino import Node, Type -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.types import TensorShape -from openvino.utils.decorators import nameable_op -from openvino.utils.types import NodeInput, as_node, as_nodes +from openvino.runtime import Node, Type +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.types import TensorShape +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import NodeInput, as_node, as_nodes _get_node_factory_opset14 = partial(_get_node_factory, "opset14") diff --git a/src/bindings/python/src/openvino/opset15/ops.py b/src/bindings/python/src/openvino/opset15/ops.py index 97d4419fc4834b..8e6b8bd46d5f7c 100644 --- a/src/bindings/python/src/openvino/opset15/ops.py +++ b/src/bindings/python/src/openvino/opset15/ops.py @@ -7,12 +7,12 @@ from typing import List, Literal, Optional import numpy as np -from openvino import Node, Type +from openvino.runtime import Node, Type from openvino.opset1 import convert_like from openvino.opset14 import constant -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import binary_op, nameable_op -from openvino.utils.types import NodeInput, as_nodes +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op +from openvino.runtime.utils.types import NodeInput, as_nodes _get_node_factory_opset15 = partial(_get_node_factory, "opset15") diff --git a/src/bindings/python/src/openvino/opset16/ops.py b/src/bindings/python/src/openvino/opset16/ops.py index e5ebdc7a2a11d6..60656f6d993b6a 100644 --- a/src/bindings/python/src/openvino/opset16/ops.py +++ b/src/bindings/python/src/openvino/opset16/ops.py @@ -6,10 +6,10 @@ from functools import partial from typing import Optional -from openvino import Node -from openvino.utils.decorators import nameable_op -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.types import NodeInput, as_nodes +from openvino.runtime import Node +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.types import NodeInput, as_nodes _get_node_factory_opset16 = partial(_get_node_factory, "opset16") diff --git a/src/bindings/python/src/openvino/opset2/ops.py b/src/bindings/python/src/openvino/opset2/ops.py index f76f608fe9a5c7..45b33f5bc0288b 100644 --- a/src/bindings/python/src/openvino/opset2/ops.py +++ b/src/bindings/python/src/openvino/opset2/ops.py @@ -9,17 +9,18 @@ from functools import partial import warnings -from 
openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset3/ops.py b/src/bindings/python/src/openvino/opset3/ops.py index 1c2c7e309fe919..989f5819acb685 100644 --- a/src/bindings/python/src/openvino/opset3/ops.py +++ b/src/bindings/python/src/openvino/opset3/ops.py @@ -8,17 +8,18 @@ import numpy as np from functools import partial -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset4/ops.py b/src/bindings/python/src/openvino/opset4/ops.py index e6f3a3a1550937..4f6ba016852b02 100644 --- a/src/bindings/python/src/openvino/opset4/ops.py +++ b/src/bindings/python/src/openvino/opset4/ops.py @@ -8,17 +8,18 @@ import numpy as np from functools import partial -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset5/ops.py b/src/bindings/python/src/openvino/opset5/ops.py index 9217830752b1d8..20057b78c7c31d 100644 --- a/src/bindings/python/src/openvino/opset5/ops.py +++ b/src/bindings/python/src/openvino/opset5/ops.py @@ -8,17 +8,18 @@ import numpy as np from functools import partial -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter, loop -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from 
openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset6/ops.py b/src/bindings/python/src/openvino/opset6/ops.py index 340d0405b4ba23..8020715f20dea3 100644 --- a/src/bindings/python/src/openvino/opset6/ops.py +++ b/src/bindings/python/src/openvino/opset6/ops.py @@ -9,13 +9,13 @@ from functools import partial, singledispatch -from openvino import Node, Type, PartialShape, Output, Shape +from openvino.runtime import Node, Type, PartialShape, Output, Shape from openvino.op import assign, Constant, Parameter from openvino.op import read_value as _read_value from openvino.op.util import VariableInfo, Variable -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op, overloading -from openvino.utils.types import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op, overloading +from openvino.runtime.utils.types import ( NodeInput, NumericType, TensorShape, diff --git a/src/bindings/python/src/openvino/opset7/ops.py b/src/bindings/python/src/openvino/opset7/ops.py index e33d266debedf1..59e09b64888eb1 100644 --- a/src/bindings/python/src/openvino/opset7/ops.py +++ b/src/bindings/python/src/openvino/opset7/ops.py @@ -7,17 +7,18 @@ from typing import Callable, Iterable, List, Optional, Set, Union import numpy as np -from openvino import Node, Shape +from openvino.runtime import Node, Shape from openvino.op import Constant, Parameter -from openvino.utils.decorators import binary_op, nameable_op, unary_op -from openvino.utils.input_validation import ( +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.input_validation import ( assert_list_of_ints, check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.node_factory import NodeFactory, _get_node_factory -from openvino.utils.types import ( +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( NodeInput, NumericData, NumericType, diff --git a/src/bindings/python/src/openvino/opset8/ops.py b/src/bindings/python/src/openvino/opset8/ops.py index a9a868e7b541d8..6995d55a28a776 100644 --- a/src/bindings/python/src/openvino/opset8/ops.py +++ b/src/bindings/python/src/openvino/opset8/ops.py @@ -9,15 +9,15 @@ import numpy as np from openvino.exceptions import UserInputError from openvino.op import Constant, Parameter, if_op -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.input_validation import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.input_validation import ( check_valid_attributes, is_non_negative_value, is_positive_value, ) -from openvino.utils.types import ( +from 
openvino.runtime.utils.types import ( NodeInput, TensorShape, as_node, diff --git a/src/bindings/python/src/openvino/opset9/ops.py b/src/bindings/python/src/openvino/opset9/ops.py index e2264845e058dc..a6d45cfd0be2cc 100644 --- a/src/bindings/python/src/openvino/opset9/ops.py +++ b/src/bindings/python/src/openvino/opset9/ops.py @@ -7,10 +7,10 @@ from typing import Optional import numpy as np -from openvino import Node -from openvino.utils.node_factory import _get_node_factory -from openvino.utils.decorators import nameable_op -from openvino.utils.types import ( +from openvino.runtime import Node +from openvino.runtime.opset_utils import _get_node_factory +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.types import ( NodeInput, as_nodes, as_node, diff --git a/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py b/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py index 717e945217468c..c14635cc118208 100644 --- a/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py +++ b/src/bindings/python/src/openvino/preprocess/torchvision/preprocess_converter.py @@ -5,7 +5,7 @@ from typing import Callable, Any, Union import logging -import openvino as ov +import openvino.runtime as ov class PreprocessConverter(): diff --git a/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py b/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py index 5dad42b47da44a..f8b51afd546f57 100644 --- a/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py +++ b/src/bindings/python/src/openvino/preprocess/torchvision/torchvision_preprocessing.py @@ -20,10 +20,10 @@ import torchvision.transforms as transforms from torchvision.transforms import InterpolationMode -import openvino as ov -import openvino.opset11 as ops -from openvino import Layout, Type -from openvino.utils.decorators import custom_preprocess_function +import openvino.runtime as ov +import openvino.runtime.opset11 as ops +from openvino.runtime import Layout, Type +from openvino.runtime.utils.decorators import custom_preprocess_function from openvino.preprocess import PrePostProcessor, ResizeAlgorithm, ColorFormat diff --git a/src/bindings/python/src/openvino/runtime/opset_utils.py b/src/bindings/python/src/openvino/runtime/opset_utils.py new file mode 100644 index 00000000000000..475750e71f87c5 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset_utils.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional +import numpy as np + +from openvino.runtime import Node +from openvino.runtime.utils.decorators import nameable_op +from openvino.runtime.utils.node_factory import NodeFactory +from openvino.runtime.utils.types import ( + as_node, + NodeInput, +) + + +def _get_node_factory(opset_version: Optional[str] = None) -> NodeFactory: + """Return NodeFactory configured to create operators from specified opset version.""" + if opset_version: + return NodeFactory(opset_version) + else: + return NodeFactory() diff --git a/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py b/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py deleted file mode 100644 index 6fb3e5f6f0c950..00000000000000 --- a/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- -# 
Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - - -from openvino.utils.node_factory import _get_node_factory diff --git a/src/bindings/python/src/openvino/runtime/utils/__init__.py b/src/bindings/python/src/openvino/runtime/utils/__init__.py index 8447e93a907277..73399ccbed2598 100644 --- a/src/bindings/python/src/openvino/runtime/utils/__init__.py +++ b/src/bindings/python/src/openvino/runtime/utils/__init__.py @@ -4,4 +4,4 @@ """Generic utilities. Factor related functions out to separate files.""" -from openvino.utils import numpy_to_c, replace_node, replace_output_update_name +from openvino._pyopenvino.util import numpy_to_c, replace_node, replace_output_update_name diff --git a/src/bindings/python/src/openvino/utils/broadcasting.py b/src/bindings/python/src/openvino/runtime/utils/broadcasting.py similarity index 87% rename from src/bindings/python/src/openvino/utils/broadcasting.py rename to src/bindings/python/src/openvino/runtime/utils/broadcasting.py index 01549625e2c628..9fd13da7728e29 100644 --- a/src/bindings/python/src/openvino/utils/broadcasting.py +++ b/src/bindings/python/src/openvino/runtime/utils/broadcasting.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Optional +from typing import List, Optional -from openvino import AxisSet -from openvino.utils.types import ( +from openvino.runtime import AxisSet, Node +from openvino.runtime.utils.types import ( + NodeInput, TensorShape, + get_dtype, + make_constant_node, ) log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py b/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py deleted file mode 100644 index 3219f239f0ab44..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.broadcasting import get_broadcast_axes diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py index 282547dd9df79a..a46105efaaeadb 100644 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.utils.data_helpers.data_dispatcher import _data_dispatch -from openvino.utils.data_helpers.wrappers import tensor_from_file -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.utils.data_helpers.wrappers import OVDict +from openvino.runtime.utils.data_helpers.data_dispatcher import _data_dispatch +from openvino.runtime.utils.data_helpers.wrappers import tensor_from_file +from openvino.runtime.utils.data_helpers.wrappers import _InferRequestWrapper +from openvino.runtime.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py similarity index 99% rename from src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py rename to src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py index d4db7cb07b629c..bce10c9c3774ef 100644 --- 
a/src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py @@ -8,7 +8,7 @@ import numpy as np from openvino._pyopenvino import ConstOutput, Tensor, Type, RemoteTensor -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper, OVDict +from openvino.runtime.utils.data_helpers.wrappers import _InferRequestWrapper, OVDict ContainerTypes = Union[dict, list, tuple, OVDict] ScalarTypes = Union[np.number, int, float] diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py deleted file mode 100644 index e0a2d022660dd3..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - - -from openvino.utils.data_helpers.data_dispatcher import ContainerTypes -from openvino.utils.data_helpers.data_dispatcher import ScalarTypes -from openvino.utils.data_helpers.data_dispatcher import ValidKeys - -from openvino.utils.data_helpers.data_dispatcher import is_list_simple_type -from openvino.utils.data_helpers.data_dispatcher import get_request_tensor -from openvino.utils.data_helpers.data_dispatcher import value_to_tensor -from openvino.utils.data_helpers.data_dispatcher import to_c_style -from openvino.utils.data_helpers.data_dispatcher import normalize_arrays -from openvino.utils.data_helpers.data_dispatcher import create_shared -from openvino.utils.data_helpers.data_dispatcher import set_request_tensor -from openvino.utils.data_helpers.data_dispatcher import update_tensor -from openvino.utils.data_helpers.data_dispatcher import update_inputs -from openvino.utils.data_helpers.data_dispatcher import create_copied -from openvino.utils.data_helpers.data_dispatcher import _data_dispatch diff --git a/src/bindings/python/src/openvino/utils/data_helpers/wrappers.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers.py similarity index 100% rename from src/bindings/python/src/openvino/utils/data_helpers/wrappers.py rename to src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers.py diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py deleted file mode 100644 index 22214fd24682da..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - - -from openvino.utils.data_helpers.wrappers import tensor_from_file -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/utils/decorators.py b/src/bindings/python/src/openvino/runtime/utils/decorators.py similarity index 98% rename from src/bindings/python/src/openvino/utils/decorators.py rename to src/bindings/python/src/openvino/runtime/utils/decorators.py index 9418c359d129e8..98da1ba4389ef7 100644 --- a/src/bindings/python/src/openvino/utils/decorators.py +++ b/src/bindings/python/src/openvino/runtime/utils/decorators.py @@ -6,8 +6,8 @@ from inspect import signature from typing import Any, 
Callable, Dict, Optional, Union, get_origin, get_args -from openvino import Node, Output -from openvino.utils.types import NodeInput, as_node, as_nodes +from openvino.runtime import Node, Output +from openvino.runtime.utils.types import NodeInput, as_node, as_nodes def _get_name(**kwargs: Any) -> Node: diff --git a/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py b/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py deleted file mode 100644 index bb0bac112d2c5f..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.decorators import _get_name -from openvino.utils.decorators import _set_node_friendly_name -from openvino.utils.decorators import nameable_op -from openvino.utils.decorators import unary_op -from openvino.utils.decorators import binary_op -from openvino.utils.decorators import custom_preprocess_function -from openvino.utils.decorators import MultiMethod -from openvino.utils.decorators import registry -from openvino.utils.decorators import overloading diff --git a/src/bindings/python/src/openvino/utils/input_validation.py b/src/bindings/python/src/openvino/runtime/utils/input_validation.py similarity index 98% rename from src/bindings/python/src/openvino/utils/input_validation.py rename to src/bindings/python/src/openvino/runtime/utils/input_validation.py index 1de08452e1da9f..e79a16c48581b1 100644 --- a/src/bindings/python/src/openvino/utils/input_validation.py +++ b/src/bindings/python/src/openvino/runtime/utils/input_validation.py @@ -9,7 +9,7 @@ import numpy as np -from openvino.exceptions import UserInputError +from openvino.runtime.exceptions import UserInputError log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py b/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py deleted file mode 100644 index 0b49e9ea33c40d..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.input_validation import assert_list_of_ints -from openvino.utils.input_validation import _check_value -from openvino.utils.input_validation import check_valid_attribute -from openvino.utils.input_validation import check_valid_attributes -from openvino.utils.input_validation import is_positive_value -from openvino.utils.input_validation import is_non_negative_value diff --git a/src/bindings/python/src/openvino/utils/node_factory.py b/src/bindings/python/src/openvino/runtime/utils/node_factory.py similarity index 92% rename from src/bindings/python/src/openvino/utils/node_factory.py rename to src/bindings/python/src/openvino/runtime/utils/node_factory.py index e999ae6988814a..25daf739223dba 100644 --- a/src/bindings/python/src/openvino/utils/node_factory.py +++ b/src/bindings/python/src/openvino/runtime/utils/node_factory.py @@ -2,16 +2,17 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import logging as log -from functools import singledispatchmethod +from functools import partial, singledispatchmethod from typing import Any, Dict, List, Optional, Union from pathlib import Path from openvino._pyopenvino import NodeFactory as _NodeFactory -from 
openvino import Node, Output, Extension +from openvino.runtime import Node, Output, Extension -from openvino.exceptions import UserInputError +from openvino.runtime.exceptions import UserInputError DEFAULT_OPSET = "opset13" @@ -124,11 +125,3 @@ def _arguments_as_outputs(arguments: List[Union[Node, Output]]) -> List[Output]: else: outputs.extend(argument.outputs()) return outputs - - -def _get_node_factory(opset_version: Optional[str] = None) -> NodeFactory: - """Return NodeFactory configured to create operators from specified opset version.""" - if opset_version: - return NodeFactory(opset_version) - else: - return NodeFactory() diff --git a/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py b/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py deleted file mode 100644 index 945ea8deb7863c..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.node_factory import NodeFactory diff --git a/src/bindings/python/src/openvino/utils/reduction.py b/src/bindings/python/src/openvino/runtime/utils/reduction.py similarity index 95% rename from src/bindings/python/src/openvino/utils/reduction.py rename to src/bindings/python/src/openvino/runtime/utils/reduction.py index e6be6d0ac9a104..71d0af8de7376e 100644 --- a/src/bindings/python/src/openvino/utils/reduction.py +++ b/src/bindings/python/src/openvino/runtime/utils/reduction.py @@ -4,7 +4,7 @@ from typing import Iterable, Optional -from openvino import Node +from openvino.runtime import Node def get_reduction_axes(node: Node, reduction_axes: Optional[Iterable[int]]) -> Iterable[int]: diff --git a/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py b/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py deleted file mode 100644 index a2fbff9e793dca..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.reduction import get_reduction_axes diff --git a/src/bindings/python/src/openvino/utils/types.py b/src/bindings/python/src/openvino/runtime/utils/types.py similarity index 97% rename from src/bindings/python/src/openvino/utils/types.py rename to src/bindings/python/src/openvino/runtime/utils/types.py index b3543739741d94..52f1faf8e1e839 100644 --- a/src/bindings/python/src/openvino/utils/types.py +++ b/src/bindings/python/src/openvino/runtime/utils/types.py @@ -9,9 +9,9 @@ import numpy as np -from openvino.exceptions import OVTypeError -from openvino import Node, Shape, Output, Type -from openvino.op import Constant +from openvino.runtime.exceptions import OVTypeError +from openvino.runtime import Node, Shape, Output, Type +from openvino.runtime.op import Constant log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/runtime/utils/types/__init__.py b/src/bindings/python/src/openvino/runtime/utils/types/__init__.py deleted file mode 100644 index 4f88d609988e8d..00000000000000 --- a/src/bindings/python/src/openvino/runtime/utils/types/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.types import TensorShape -from openvino.utils.types import 
NumericData -from openvino.utils.types import NumericType -from openvino.utils.types import ScalarData -from openvino.utils.types import NodeInput - -from openvino.utils.types import openvino_to_numpy_types_map -from openvino.utils.types import openvino_to_numpy_types_str_map -from openvino.utils.types import get_element_type -from openvino.utils.types import get_element_type_str -from openvino.utils.types import get_dtype -from openvino.utils.types import get_numpy_ctype -from openvino.utils.types import get_ndarray -from openvino.utils.types import get_shape -from openvino.utils.types import make_constant_node -from openvino.utils.types import as_node -from openvino.utils.types import as_nodes diff --git a/src/bindings/python/src/openvino/package_utils.py b/src/bindings/python/src/openvino/utils.py similarity index 97% rename from src/bindings/python/src/openvino/package_utils.py rename to src/bindings/python/src/openvino/utils.py index 6aa3f3ed39b556..9890ae9b3e6460 100644 --- a/src/bindings/python/src/openvino/package_utils.py +++ b/src/bindings/python/src/openvino/utils.py @@ -21,9 +21,9 @@ def _add_openvino_libs_to_search_path() -> None: if os.path.isdir(os.path.join(os.path.dirname(__file__), "libs")): # looking for the libs in the pip installation path. openvino_libs.append(os.path.join(os.path.dirname(__file__), "libs")) - elif os.path.isdir(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, "Library", "bin")): + elif os.path.isdir(os.path.join(os.path.dirname(__file__), "..", "..", "..", "Library", "bin")): # looking for the libs in the conda installation path - openvino_libs.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, "Library", "bin")) + openvino_libs.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "Library", "bin")) else: # setupvars.bat script set all libs paths to OPENVINO_LIB_PATHS environment variable. openvino_libs_installer = os.getenv("OPENVINO_LIB_PATHS") diff --git a/src/bindings/python/src/openvino/utils/__init__.py b/src/bindings/python/src/openvino/utils/__init__.py deleted file mode 100644 index 2ccc79d20cce84..00000000000000 --- a/src/bindings/python/src/openvino/utils/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -"""Generic utilities. 
Factor related functions out to separate files.""" - -from openvino._pyopenvino.util import numpy_to_c, replace_node, replace_output_update_name - -from openvino.package_utils import get_cmake_path -from openvino.package_utils import deprecated -from openvino.package_utils import classproperty -from openvino.package_utils import deprecatedclassproperty diff --git a/src/bindings/python/src/openvino/utils/data_helpers/__init__.py b/src/bindings/python/src/openvino/utils/data_helpers/__init__.py deleted file mode 100644 index 282547dd9df79a..00000000000000 --- a/src/bindings/python/src/openvino/utils/data_helpers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.utils.data_helpers.data_dispatcher import _data_dispatch -from openvino.utils.data_helpers.wrappers import tensor_from_file -from openvino.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/common/transformations/include/transformations/common_optimizations/sdpa_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/sdpa_fusion.hpp new file mode 100644 index 00000000000000..84383b777604ea --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/sdpa_fusion.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +/// This pass transforms the following sub-graph to a single Scaled Dot Product Attention operation. +/// Before: +/// ┌───────┐ ┌───────┐ ┌───────┐ +/// │ Q │ │ K │ │ V │ +/// └───┬───┘ └───┬───┘ └───┬───┘ +/// │ │ │ +/// │ │ │ +/// ┌───┴───┐ ┌─────┴──────┐ │ +/// │ MatMul│<──│ Transpose │ │ +/// └───┬───┘ | (Optional) │ │ +/// │ └────────────┘ │ +/// ┌───┴───┐ ┌─────────────┐ │ +/// │ Add │<───│AttentionMask│ │ +/// └───┬───┘ | (Optional) │ │ +/// │ └─────────────┘ │ +/// ┌───┴───┐ │ +/// │Softmax│ │ +/// └───┬───┘ │ +/// │ │ +/// ┌───┴───┐ │ +/// │ MatMul│<─────────────────────┘ +/// └───┬───┘ +/// ┌───┴───┐ +/// │ Output│ +/// └───────┘ +/// +/// After: +/// ┌───────┐ ┌───────┐ ┌───────┐ ┌─────────────┐ +/// │ Q │ │ K │ │ V │ │AttentionMask│ +/// └───┬───┘ └───┬───┘ └───┬───┘ └──────┬──────┘ +/// │ │ │ │ +/// │ │ │ │ +/// ┌───┴────────────┴────────────┴───────────────┴─┐ +/// │ ScaledDotProductAttention │ +/// └────────────────────┬──────────────────────────┘ +/// │ +/// │ +/// ┌────┴────┐ +/// │ Output │ +/// └─────────┘ +class TRANSFORMATIONS_API SDPAFusion : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("SDPAFusion", "0"); + SDPAFusion(); +}; + +} // namespace pass +} // namespace ov diff --git a/src/common/transformations/include/transformations/common_optimizations/sdpa_scale_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/sdpa_scale_fusion.hpp new file mode 100644 index 00000000000000..cae0363e785f4e --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/sdpa_scale_fusion.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +/// Merges explicit multiplication by scalar value for Q and 
K into scale attribute of SDPA op +/// Before: +/// ┌───────┐ ┌───────┐ ┌───────┐ ┌─────────────┐ ┌─────────────┐ +/// │ Q │ │ K │ │ V │ │AttentionMask│ │ Scale | +/// └───┬───┘ └───┬───┘ └───┬───┘ │ (Optional) │ │ (Optional) │ +/// │ │ │ └──────┬──────┘ └───────┬─────┘ +/// │ │ │ │ | +/// ┌───┴───┐ ┌───┴───┐ │ │ | +/// │ Mul | │ Mul │ | │ | +/// └───┬───┘ └───┬───┘ │ │ │ +/// │ │ │ │ │ +/// | │ │ │ │ +/// ┌───┴────────────┴────────────┴─────────────┴─┐ | +/// │ ScaledDotProductAttention │──────────────────┘ +/// └────────────────────┬────────────────────────┘ +/// │ +/// │ +/// ┌────┴────┐ +/// │ Output │ +/// └─────────┘ +/// After: +/// ┌───────┐ ┌───────┐ ┌───────┐ ┌─────────────┐ ┌───────┐ +/// │ Q │ │ K │ │ V │ │AttentionMask│ │ Scale | +/// └───┬───┘ └───┬───┘ └───┬───┘ └──────┬──────┘ └───┬───┘ +/// │ │ │ │ | +/// │ │ │ │ | +/// | │ │ │ | +/// ┌───┴────────────┴────────────┴─────────────┴─┐ | +/// │ ScaledDotProductAttention │───────────┘ +/// └────────────────────┬────────────────────────┘ +/// │ +/// │ +/// ┌────┴────┐ +/// │ Output │ +/// └─────────┘ +/// Multiply ops for Q and K are eliminated in the following cases: +/// 1. Q_scale and K_scale are constant +/// 2. Q_scale * SDPA_Scale == 1 or K_scale * SDPA_Scale == 1 +class TRANSFORMATIONS_API SDPAScaleFusion : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("SDPAScaleFusion", "0"); + SDPAScaleFusion(); +}; + +} // namespace pass +} // namespace ov
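A quick numeric check of the folding rule described above (an illustrative sketch, not part of the patch; the values mirror the SDPAScaleFusionTest1 case added later in this change): when SDPA has no explicit scale input, its implicit scale is 1/sqrt(head_size), and the fused scale is the product of that value with the constant Q and K multipliers.

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float q_scale = 8.0f;  // constant Multiply on the Q branch
    const float k_scale = 8.0f;  // constant Multiply on the K branch
    const int head_size = 32;
    // No explicit SDPA scale input -> the implicit default 1 / sqrt(head_size).
    const float prev_scale = 1.0f / std::sqrt(static_cast<float>(head_size));
    // SDPAScaleFusion folds all three factors into a single scale attribute.
    const float fused_scale = prev_scale * q_scale * k_scale;
    assert(std::fabs(fused_scale - 64.0f / std::sqrt(32.0f)) < 1e-6f);
    return 0;
}
```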
diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp index 50c0ecd20e76af..825ce8acbd7998 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/position_ids_replacer.hpp @@ -15,6 +15,7 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API PositionIDsReplacer; +class TRANSFORMATIONS_API PositionIDsReplacerQwen; } // namespace pass } // namespace ov @@ -24,3 +25,22 @@ class ov::pass::PositionIDsReplacer : public ov::pass::MatcherPass { OPENVINO_MATCHER_PASS_RTTI("PositionIDsReplacer"); explicit PositionIDsReplacer(const Output<Node>& position_ids); }; + +/** + * @brief The Qwen model expects data to be processed in order; its "position ids" input is detached and + * is not explicitly used in the model. The model uses implicitly defined "position ids" based + * on the past KV cache size. + * + * To use this model in Continuous batching mode, we need to apply position_ids and + * use the corresponding rotary_emb_cos/rotary_emb_sin. + * For this, we replace + * rotary_emb_cos/rotary_emb_sin -> Slice -> Slice + * with + * rotary_emb_cos/rotary_emb_sin -> Gather(by position_ids) + * which enables applying RoPE to each token independently of its order in the input tensor. + */ +class ov::pass::PositionIDsReplacerQwen : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("PositionIDsReplacerQwen"); + explicit PositionIDsReplacerQwen(const Output<Node>& position_ids); +};
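To make the Slice-to-Gather rewrite concrete, here is a minimal sketch of the replacement subgraph (names and shapes are illustrative assumptions, not code from the patch): with continuous batching, tokens arrive out of order, so each token picks its own cos/sin row via Gather along the sequence axis instead of two sequential Slices.

```cpp
#include "openvino/core/node.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/gather.hpp"

using namespace ov;

// sincos_table: e.g. [1, max_seq_len, 1, head_size]; position_ids: [num_tokens]
std::shared_ptr<Node> gather_rope_rows(const Output<Node>& sincos_table,
                                       const Output<Node>& position_ids) {
    auto axis = op::v0::Constant::create(element::i64, Shape{}, {1});  // sequence axis
    return std::make_shared<op::v8::Gather>(sincos_table, position_ids, axis);
}
```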
diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp index f5497207eb4e17..d1cc5d5126cd67 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp @@ -4,7 +4,6 @@ #pragma once -#include "openvino/cc/pass/itt.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/subtract.hpp" #include "openvino/pass/matcher_pass.hpp" @@ -22,6 +21,8 @@ class TRANSFORMATIONS_API PrevSequenceLengthPattern; class ov::pass::PrevSequenceLengthPattern : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("PrevSequenceLengthPattern"); - explicit PrevSequenceLengthPattern(std::shared_ptr<ov::Node> prev_max_seq_len, std::shared_ptr<ov::Node> batch_dim); + OPENVINO_MATCHER_PASS_RTTI("PrevSequenceLengthPattern", "0"); + explicit PrevSequenceLengthPattern(const std::shared_ptr<ov::Node>& unsqueezed_input_ids, + const std::shared_ptr<ov::Node>& max_context_len, + const std::shared_ptr<ov::Node>& position_ids); }; diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp index b5ecb96fa95198..2456161ea80a78 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp @@ -15,6 +15,7 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API TotalSequenceLengthPattern; +class TRANSFORMATIONS_API TotalSequenceLengthPatternQwen; } // namespace pass } // namespace ov @@ -24,3 +25,22 @@ class ov::pass::TotalSequenceLengthPattern : public ov::pass::MatcherPass { OPENVINO_MATCHER_PASS_RTTI("TotalSequenceLengthPattern"); explicit TotalSequenceLengthPattern(const std::shared_ptr<ov::op::v0::Parameter>& max_context_len); }; + +/** + * @brief The Qwen model has a specific pattern for TotalSequenceLen place detection. + * + * common pattern: Add (PrevSeqLen, CurrentSeqLen) + * + * The CurrentSeqLen is presented in this form: + * CurrentSeqLen: Parameter(name: input_ids) -> ShapeOf -> Gather + * + * Before applying this transformation, we already detected the PrevSeqLen place in the PrevSequenceLengthPattern + * and replaced it with the following subgraph: + * PrevSeqLen: Subtract (in: Parameter(name: max_context_len), in: CurrentSeqLen) + * + **/ +class ov::pass::TotalSequenceLengthPatternQwen : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("TotalSequenceLengthPatternQwen", "0"); + explicit TotalSequenceLengthPatternQwen(const std::shared_ptr<ov::op::v0::Parameter>& max_context_len); +};
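Why the matched Add can be replaced by max_context_len directly: PrevSeqLen has already been rewritten as max_context_len - CurrentSeqLen, so the sum collapses. A scalar sketch with illustrative numbers (not code from the patch):

```cpp
#include <cassert>

int main() {
    const int max_context_len = 128;  // Parameter introduced for PagedAttention
    const int current_seq_len = 3;    // derived from the input_ids shape
    // PrevSequenceLengthPattern already rewrote PrevSeqLen as:
    const int prev_seq_len = max_context_len - current_seq_len;
    // ...so the matched Add(PrevSeqLen, CurrentSeqLen) equals max_context_len:
    assert(prev_seq_len + current_seq_len == max_context_len);
    return 0;
}
```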
"transformations/common_optimizations/shared_ops_optimization.hpp" @@ -229,6 +230,7 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr ADD_MATCHER(common_fusions, ConvertTensorIteratorToSequence) ADD_MATCHER(common_fusions, SplitConcatPairToInterpolateFusion, m_use_shapes) ADD_MATCHER(common_fusions, ConvolutionToGroupConvolutionFusion) + ADD_MATCHER(common_fusions, SDPAFusion) if (m_use_shapes) { ADD_MATCHER(common_fusions, NearestNeighborUpsamplingFusion) } diff --git a/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp new file mode 100644 index 00000000000000..fc581580f70001 --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/sdpa_fusion.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/gen_pattern.hpp" + +namespace ov { +namespace pass { + +SDPAFusion::SDPAFusion() { + using namespace ov::pass::pattern; + using namespace ov::gen_pattern; + + auto q = makePattern(ov::Rank(4)); + auto k = makePattern(ov::Rank(4)); + auto v = makePattern(ov::Rank(4)); + auto mask = makePattern(); + + auto k_transpose_order = pattern::wrap_type([](const Output& node) { + auto axis_order = + std::dynamic_pointer_cast(node.get_node_shared_ptr())->cast_vector(); + return axis_order == std::vector{0, 1, 3, 2}; + }); + + auto k_t = pattern::wrap_type({k, k_transpose_order}); + auto qk_nn = makePattern({q, k_t}, {{"transpose_a", false}, {"transpose_b", false}}); + auto qk_nt = makePattern({q, k}, {{"transpose_a", false}, {"transpose_b", true}}); + auto qk = qk_nt | qk_nn; + auto optional_add_mask = optional({qk, mask}); + auto softmax = makePattern({optional_add_mask}, {{"axis", "-1"}}); + auto qkv = makePattern({softmax, v}, {{"transpose_a", false}, {"transpose_b", false}}); + + auto valid_qk_shapes = [](const std::shared_ptr& qk_matmul) { + auto q_pshape = qk_matmul->get_input_partial_shape(0); + auto k_pshape = qk_matmul->get_input_partial_shape(1); + + const size_t q_head_size_idx = 3; + const size_t k_head_size_idx = qk_matmul->get_transpose_b() ? 
diff --git a/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp new file mode 100644 index 00000000000000..fc581580f70001 --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/sdpa_fusion.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/sdpa_fusion.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/gen_pattern.hpp" + +namespace ov { +namespace pass { + +SDPAFusion::SDPAFusion() { + using namespace ov::pass::pattern; + using namespace ov::gen_pattern; + + auto q = makePattern(ov::Rank(4)); + auto k = makePattern(ov::Rank(4)); + auto v = makePattern(ov::Rank(4)); + auto mask = makePattern(); + + auto k_transpose_order = pattern::wrap_type<ov::op::v0::Constant>([](const Output<Node>& node) { + auto axis_order = + std::dynamic_pointer_cast<ov::op::v0::Constant>(node.get_node_shared_ptr())->cast_vector<int64_t>(); + return axis_order == std::vector<int64_t>{0, 1, 3, 2}; + }); + + auto k_t = pattern::wrap_type<ov::op::v1::Transpose>({k, k_transpose_order}); + auto qk_nn = makePattern<ov::op::v0::MatMul>({q, k_t}, {{"transpose_a", false}, {"transpose_b", false}}); + auto qk_nt = makePattern<ov::op::v0::MatMul>({q, k}, {{"transpose_a", false}, {"transpose_b", true}}); + auto qk = qk_nt | qk_nn; + auto optional_add_mask = optional<ov::op::v1::Add>({qk, mask}); + auto softmax = makePattern<ov::op::v8::Softmax>({optional_add_mask}, {{"axis", "-1"}}); + auto qkv = makePattern<ov::op::v0::MatMul>({softmax, v}, {{"transpose_a", false}, {"transpose_b", false}}); + + auto valid_qk_shapes = [](const std::shared_ptr<ov::op::v0::MatMul>& qk_matmul) { + auto q_pshape = qk_matmul->get_input_partial_shape(0); + auto k_pshape = qk_matmul->get_input_partial_shape(1); + + const size_t q_head_size_idx = 3; + const size_t k_head_size_idx = qk_matmul->get_transpose_b() ? 3 : 2; + + return q_pshape.size() == 4 && k_pshape.size() == 4 && q_pshape[q_head_size_idx].is_static() && + k_pshape[k_head_size_idx].is_static() && + q_pshape[q_head_size_idx].get_length() == k_pshape[k_head_size_idx].get_length(); + }; + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto q_node = pattern_map.at(q); + auto k_node = pattern_map.at(k); + auto v_node = pattern_map.at(v); + + if (!valid_qk_shapes(ov::as_type_ptr<ov::op::v0::MatMul>(pattern_map.at(qk).get_node_shared_ptr()))) { + return false; + } + + if (pattern_map.at(qk).get_target_inputs().size() > 1 || + pattern_map.at(softmax).get_target_inputs().size() > 1) { + return false; + } + if (pattern_map.count(optional_add_mask) && (pattern_map.at(optional_add_mask).get_target_inputs().size() > 1 || + pattern_map.at(mask).get_partial_shape().size() > 4)) { + return false; + } + + Output<ov::Node> mask_value; + Output<ov::Node> mask_input; + if (pattern_map.find(optional_add_mask) != pattern_map.end()) { + mask_value = pattern_map.at(mask); + } else { + mask_value = ov::op::v0::Constant::create(q_node.get_element_type(), ov::Shape{}, std::vector<float>{0}); + } + + if (mask_value.get_partial_shape().size() > 4) { + return false; + } + + if (mask_value.get_partial_shape().rank() == 0 || mask_value.get_partial_shape().rank() == 4) { + mask_input = mask_value; + } else { + size_t rank_diff = q_node.get_partial_shape().size() - mask_value.get_partial_shape().size(); + std::vector<int64_t> axes(rank_diff); + std::iota(axes.begin(), axes.end(), 0); + mask_input = std::make_shared<ov::op::v0::Unsqueeze>( + mask_value, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank_diff}, axes)); + } + + std::shared_ptr<ov::Node> scale_node = + ov::op::v0::Constant::create(q_node.get_element_type(), ov::Shape{}, std::vector<float>{1.0f}); + + std::shared_ptr<ov::Node> sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q_node, + k_node, + v_node, + mask_input, + scale_node, + false); + + sdpa->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa); + ov::replace_node(m.get_match_root(), sdpa); + + return true; + }; + + auto m = std::make_shared<ov::pass::pattern::Matcher>(qkv, "SDPAFusion"); + this->register_matcher(m, callback); +} + +} // namespace pass +} // namespace ov
diff --git a/src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp new file mode 100644 index 00000000000000..3d750fe38a868e --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/sdpa_scale_fusion.hpp" + +#include <cmath> + +#include "openvino/core/node.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "transformations/utils/gen_pattern.hpp" + +namespace ov { +namespace pass { + +SDPAScaleFusion::SDPAScaleFusion() { + using namespace ov::pass::pattern; + using namespace ov::gen_pattern; + + auto q = makePattern(ov::Rank(4)); + auto k = makePattern(ov::Rank(4)); + auto v = makePattern(ov::Rank(4)); + auto mask = makePattern(); + auto sdpa_scale = makeConst({}); + auto scale_q = makePattern("[]") | makePattern("[1]"); + auto scale_k = makePattern("[]") | makePattern("[1]"); + + auto scaled_q = optional<ov::op::v1::Multiply>({q, scale_q}); + auto scaled_k = optional<ov::op::v1::Multiply>({k, scale_k}); + auto sdpa_mask_scale = + makePattern<ov::op::v13::ScaledDotProductAttention>({scaled_q, scaled_k, v, mask, sdpa_scale}, + {{"causal", false}}); + auto sdpa_mask = + makePattern<ov::op::v13::ScaledDotProductAttention>({scaled_q, scaled_k, v, mask}, {{"causal", false}}); + auto sdpa_simple = + makePattern<ov::op::v13::ScaledDotProductAttention>({scaled_q, scaled_k, v}, {{"causal", false}}); + auto sdpa = sdpa_simple | sdpa_mask | sdpa_mask_scale; + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto sdpa = m.get_match_root(); + + const bool has_q_scale = pattern_map.count(scaled_q); + const bool has_k_scale = pattern_map.count(scaled_k); + + // Nothing to do + if (!has_q_scale && !has_k_scale) + return false; + + auto prev_scale_value = 1.0f; + auto scale_q_value = 1.0f; + auto scale_k_value = 1.0f; + auto scale_et = sdpa->get_output_element_type(0); + + Output<ov::Node> q_input = sdpa->get_input_source_output(0); + Output<ov::Node> k_input = sdpa->get_input_source_output(1); + + std::shared_ptr<ov::Node> scale_q_node = nullptr; + std::shared_ptr<ov::Node> scale_k_node = nullptr; + + if (pattern_map.find(sdpa_scale) != pattern_map.end()) { + auto prev_scale_node = + ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(sdpa_scale).get_node_shared_ptr()); + prev_scale_value = prev_scale_node->cast_vector<float>()[0]; + scale_et = prev_scale_node->get_output_element_type(0); + } else { + auto head_size = q_input.get_partial_shape()[3]; + if (head_size.is_dynamic()) + return false; + + prev_scale_value = 1.0f / std::sqrt(static_cast<float>(head_size.get_length())); + } + + // Extract scalar scale values for Q and K if those are constant and set new inputs for SDPA + if (has_q_scale) { + scale_q_node = pattern_map.at(scale_q).get_node_shared_ptr(); + if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) { + scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0]; + q_input = pattern_map.at(q); + } + } + if (has_k_scale) { + scale_k_node = pattern_map.at(scale_k).get_node_shared_ptr(); + if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) { + scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0]; + k_input = pattern_map.at(k); + } + } + + Output<ov::Node> new_scale_node; + auto new_scale_val = prev_scale_value * scale_q_value * scale_k_value; + + // If the new scale is 1 and we have a non-constant scale node for either Q or K, then we can make it a scale of SDPA + if (new_scale_val == 1.0f) { + if (has_q_scale && !ov::is_type<ov::op::v0::Constant>(scale_q_node)) { + new_scale_node = pattern_map.at(scale_q); + q_input = pattern_map.at(q); + } else if (has_k_scale && !ov::is_type<ov::op::v0::Constant>(scale_k_node)) { + new_scale_node = pattern_map.at(scale_k); + k_input = pattern_map.at(k); + } else { + new_scale_node = ov::op::v0::Constant::create(scale_et, ov::Shape{}, std::vector<float>{new_scale_val}); + } + } else { + new_scale_node = ov::op::v0::Constant::create(scale_et, ov::Shape{}, std::vector<float>{new_scale_val}); + } + + OutputVector new_inputs = {q_input, k_input, pattern_map.at(v)}; + if (pattern_map.find(mask) != pattern_map.end()) { + new_inputs.push_back(pattern_map.at(mask)); + } else { + new_inputs.push_back( + ov::op::v0::Constant::create(new_scale_node.get_element_type(), ov::Shape{}, std::vector<float>{0.0f})); + } + + new_inputs.push_back(new_scale_node); + + auto new_sdpa = sdpa->clone_with_new_inputs(new_inputs); + new_sdpa->set_friendly_name(sdpa->get_friendly_name()); + ov::copy_runtime_info(sdpa, new_sdpa); + ov::replace_node(sdpa, new_sdpa); + + return true; + }; + + auto m = std::make_shared<ov::pass::pattern::Matcher>(sdpa, "SDPAScaleFusion"); + this->register_matcher(m, callback); +} + +} // namespace pass +} // namespace ov
diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp index a72a49fb4832eb..397746c75bb84d 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/position_ids_replacer.cpp @@ -7,11 +7,18 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/op/gather.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/unsqueeze.hpp" #include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" using namespace ov::op; +using namespace ov::pass::pattern; // TODO: Instead of using the following transformation that matches quite a specific place in a model graph in case when // position_ids parameter is missing, consider replacing always existing attention_mask parameter with a sub-graph using @@ -19,25 +26,90 @@ using namespace ov::op; ov::pass::PositionIDsReplacer::PositionIDsReplacer(const Output<Node>& position_ids) { MATCHER_SCOPE(PositionIDsReplacer); - auto input_ids = pattern::any_input(); - auto input_embed = pattern::wrap_type<v8::Gather>({pattern::any_input(), input_ids, pattern::any_input()}); + auto input_ids = any_input(); + auto input_embed = wrap_type<v8::Gather>({any_input(), input_ids, any_input()}); - auto position_ids_pattern = pattern::any_input(); - auto offset = pattern::wrap_type<v0::Constant>(); - auto add_offset = pattern::wrap_type<v1::Add>({position_ids_pattern, offset}); - auto convert = pattern::wrap_type<v0::Convert>({add_offset}); - auto position_embed = pattern::wrap_type<v8::Gather>({pattern::any_input(), convert, pattern::any_input()}); + auto position_ids_pattern = any_input(); + auto offset = wrap_type<v0::Constant>(); + auto add_offset = wrap_type<v1::Add>({position_ids_pattern, offset}); + auto convert = wrap_type<v0::Convert>({add_offset}); + auto position_embed = wrap_type<v8::Gather>({any_input(), convert, any_input()}); - auto mul = pattern::optional<v1::Multiply>({input_embed, pattern::any_input()}); + auto mul = optional<v1::Multiply>({input_embed, any_input()}); - auto add = pattern::wrap_type<v1::Add>({mul, position_embed}); + auto add = wrap_type<v1::Add>({mul, position_embed}); - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + ov::matcher_pass_callback callback = [=](Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); replace_node(pattern_map.at(position_ids_pattern).get_node_shared_ptr(), position_ids.get_node_shared_ptr()); return true; }; - auto m = std::make_shared<ov::pass::pattern::Matcher>(add, matcher_name); + auto m = std::make_shared<Matcher>(add, matcher_name); register_matcher(m, callback); -} \ No newline at end of file +} + +ov::pass::PositionIDsReplacerQwen::PositionIDsReplacerQwen(const Output<Node>& position_ids) { + MATCHER_SCOPE(PositionIDsReplacerQwen); + + auto _const = []() { + return wrap_type<v0::Constant>(); + }; + + // total seq len: + auto p_max_context_len = wrap_type<v0::Parameter>(); + auto p_opt_convert = optional<v0::Convert>(p_max_context_len); + auto p_opt_reshape = optional<v1::Reshape>({p_opt_convert, any_input()}); + + // current seq len: + // it might be present in 2 different ways: + // input_ids -> unsqueeze -> reshape -> convert -> shape_of -> gather + // QKV -> variadic_split(Q or K) -> rope Q/K -> shape_of -> gather + // Probably we can use the symbols to re-use one of these ways. + // Currently, "any_input" is used to detect both places. + auto p_shape_of = wrap_type<v3::ShapeOf>({any_input()}); + auto p_current_len = wrap_type<v8::Gather>({p_shape_of, _const(), _const()}); + + auto p_neg_const = wrap_type<v0::Constant>(); + auto p_neg_mul = wrap_type<v1::Multiply>({p_current_len, p_neg_const}); + // the rotary_emb_cos/rotary_emb_sin are sliced by the total length [1,..4096,1,128] + auto p_rotary_emb_sincos = wrap_type<v0::Constant>(); + auto p_slice_1 = wrap_type<v8::Slice>({p_rotary_emb_sincos, _const(), p_opt_reshape, _const(), _const()}); + auto p_slice_2 = wrap_type<v8::Slice>({p_slice_1, p_neg_mul, _const(), _const(), _const()}); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto max_context_len = pattern_map.at(p_max_context_len).get_node_shared_ptr(); + if (max_context_len->get_friendly_name() != "max_context_len") { + return false; + } + auto rotary_emb_sincos = pattern_map.at(p_rotary_emb_sincos).get_node_shared_ptr(); + auto slice_1 = pattern_map.at(p_slice_1).get_node_shared_ptr(); + auto slice_2 = pattern_map.at(p_slice_2).get_node_shared_ptr(); + + auto axis = v0::Constant::create(element::i64, Shape{}, {1}); + // in case of PagedAttention (Continuous batching) the rotary_emb_cos/rotary_emb_sin + // are used not in the sequential order, so we need to use position_ids to get the expected values. + auto gather = std::make_shared<v8::Gather>(slice_1->input_value(0), position_ids, axis); + gather->set_friendly_name(slice_2->get_friendly_name()); + gather->validate_and_infer_types(); + + auto pshape = rotary_emb_sincos->get_output_partial_shape(0); + if (pshape.rank().is_dynamic() || pshape.rank().get_length() != 4) { + return false; + } + + // PagedAttention expects the following layout for Q,K,V: + // [batch_size_in_tokens, num_kv_heads * head_size] + // so here we need to reshape the output tensor to move the seq dim (num tokens) to the batch + // num_kv_heads * head_size are already handled in the StateManagementPattern transformation + auto head_size = static_cast<int64_t>(pshape[3].get_length()); + auto new_shape = v0::Constant::create(element::i64, Shape{4}, std::vector<int64_t>{-1, 1, 1, head_size}); + auto reshape = std::make_shared<v1::Reshape>(gather, new_shape, false); + replace_node(slice_2, reshape); + return true; + }; + + auto m = std::make_shared<Matcher>(p_slice_2, matcher_name); + register_matcher(m, callback); +}
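A small shape-bookkeeping sketch for the Gather-plus-Reshape emitted above (illustrative sizes, not from the patch): gathering 5 tokens from a [1, 4096, 1, 128] sin/cos table along axis 1 yields [1, 5, 1, 128], and the {-1, 1, 1, 128} Reshape then moves the token count into the batch dimension, matching PagedAttention's batch-of-tokens layout.

```cpp
#include <cassert>

#include "openvino/op/constant.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"

using namespace ov;

int main() {
    auto table = std::make_shared<op::v0::Parameter>(element::f32, Shape{1, 4096, 1, 128});
    auto position_ids = std::make_shared<op::v0::Parameter>(element::i64, Shape{5});
    auto axis = op::v0::Constant::create(element::i64, Shape{}, {1});
    auto gather = std::make_shared<op::v8::Gather>(table, position_ids, axis);  // [1, 5, 1, 128]
    auto new_shape = op::v0::Constant::create(element::i64, Shape{4}, std::vector<int64_t>{-1, 1, 1, 128});
    auto reshape = std::make_shared<op::v1::Reshape>(gather, new_shape, false);
    // The token (sequence) dimension is now the batch dimension.
    assert(reshape->get_output_shape(0) == (Shape{5, 1, 1, 128}));
    return 0;
}
```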
diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp index 36d9d88975b2e0..55d7af822c3857 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.cpp @@ -14,8 +14,9 @@ using namespace ov::op; -ov::pass::PrevSequenceLengthPattern::PrevSequenceLengthPattern(std::shared_ptr<ov::Node> prev_max_seq_len, - std::shared_ptr<ov::Node> batch_dim) { +ov::pass::PrevSequenceLengthPattern::PrevSequenceLengthPattern(const std::shared_ptr<ov::Node>& unsqueezed_input_ids, + const std::shared_ptr<ov::Node>& max_context_len, + const std::shared_ptr<ov::Node>& position_ids) { MATCHER_SCOPE(PrevSequenceLengthPattern); // The transformation addresses two cases that look similar: (1) previous sequence length, (2) batch size in // kv-cache state. In the first case it should replace it by prev_max_seq_len. For the second case, connect to batch_dim. @@ -40,8 +41,16 @@ ov::pass::PrevSequenceLengthPattern::PrevSequenceLengthPattern(std::shared_ptr<ov::Node> auto target_type = gather->get_output_element_type(0); std::shared_ptr<Node> replacement; if (kv_init_shape[axis].is_static() && kv_init_shape[axis].get_length() == 0) { + auto cur_seq_len = std::make_shared<v8::Gather>(std::make_shared<v3::ShapeOf>(unsqueezed_input_ids), + v0::Constant::create(element::i64, Shape{}, {1}), + v0::Constant::create(element::i64, Shape{}, {0})); + auto cur_seq_len_i32 = std::make_shared<v0::Convert>(cur_seq_len, element::i32); + auto prev_max_seq_len = std::make_shared<v1::Subtract>(max_context_len, cur_seq_len_i32); replacement = prev_max_seq_len; } else { + // it is not always required, so will be disposed if not needed + auto batch_dim = std::make_shared<v3::ShapeOf>(position_ids); + // assumption that any other axis should point to batch dimension, precise reasoning is too complex // TODO: provide more reliable check replacement = batch_dim;
diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp index b55c3d73316120..a36085c34237a4 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp @@ -437,6 +437,7 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par parameters_to_remove.push_back(param); } + pa_transpose->set_friendly_name(sdpa_node->get_friendly_name()); replace_node(m.get_match_root(), pa_transpose); return true; }; diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp index 18387d5ca1ae04..cbf9426a0c82c5 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/total_sequence_length_pattern.cpp @@ -6,27 +6,49 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/core/validation_util.hpp" +#include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" #include "openvino/op/shape_of.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" using namespace ov::op; +using namespace ov::pass::pattern; + +namespace { + +void align_replacement(std::shared_ptr<ov::Node>& replacement, + const ov::PartialShape& required_shape, + ov::element::Type target_type) { + if (replacement->get_output_element_type(0) != target_type) { + replacement = std::make_shared<v0::Convert>(replacement, target_type); + } + + if (replacement->get_output_partial_shape(0) != required_shape && required_shape.rank().is_static()) { + replacement = ov::op::util::reshapeTo(replacement, ov::Shape(required_shape.rank().get_length(), 1)); + } +} + +} // namespace ov::pass::TotalSequenceLengthPattern::TotalSequenceLengthPattern( const std::shared_ptr<ov::op::v0::Parameter>& max_context_len) { MATCHER_SCOPE(TotalSequenceLengthPattern); - auto kv_past = pattern::wrap_type<v6::ReadValue>({pattern::any_input()}); - auto kv_gather = pattern::wrap_type<v8::Gather>({kv_past, pattern::any_input(), pattern::any_input()}); - auto kv_current = pattern::any_input(); - auto kv_concat = pattern::wrap_type<v0::Concat>({kv_gather, kv_current}); - auto kv_shape = pattern::wrap_type<v3::ShapeOf>({kv_concat}); - auto gather_idx_label = pattern::wrap_type<v0::Constant>(); - auto seq = pattern::wrap_type<v8::Gather>({kv_shape, gather_idx_label, pattern::any_input()}); + auto kv_past = wrap_type<v6::ReadValue>({any_input()}); + auto kv_gather = wrap_type<v8::Gather>({kv_past, any_input(), any_input()}); + auto kv_current = any_input(); + auto kv_concat = wrap_type<v0::Concat>({kv_gather, kv_current}); + auto kv_shape = wrap_type<v3::ShapeOf>({kv_concat}); + auto gather_idx_label = wrap_type<v0::Constant>(); + auto seq = wrap_type<v8::Gather>({kv_shape, gather_idx_label, any_input()}); - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + ov::matcher_pass_callback callback = [=](Matcher& m) { // TODO: Check that seq has axis that really takes sequence len but not any other dimension -- // use symbolic infra or look at the constant input const auto& pattern_map = m.get_pattern_value_map(); @@ -71,16 +93,8 @@ ov::pass::TotalSequenceLengthPattern::TotalSequenceLengthPattern( if (concat_axis_to_compare == gather_idx_to_compare) { auto target_type = gather->get_output_element_type(0); - - if (replacement->get_output_element_type(0) != target_type) { - replacement = std::make_shared<v0::Convert>(replacement, target_type); - } - auto required_shape = gather->get_output_partial_shape(0); - - if (replacement->get_output_partial_shape(0) != required_shape && required_shape.rank().is_static()) { - replacement = op::util::reshapeTo(replacement, Shape(required_shape.rank().get_length(), 1)); - } + align_replacement(replacement, required_shape, target_type); } else { // TODO: change in the future when we start supporting dynamic shapes here replacement = ov::util::get_constant_from_source(gather->output(0)); @@ -94,6 +108,41 @@ ov::pass::TotalSequenceLengthPattern::TotalSequenceLengthPattern( return true; }; - auto m = std::make_shared<ov::pass::pattern::Matcher>(seq, matcher_name); + auto m = std::make_shared<Matcher>(seq, matcher_name); + register_matcher(m, callback); +} + +ov::pass::TotalSequenceLengthPatternQwen::TotalSequenceLengthPatternQwen( + const std::shared_ptr<ov::op::v0::Parameter>& max_context_len) { + MATCHER_SCOPE(TotalSequenceLengthPatternQwen); + + auto p_input_ids = wrap_type<v0::Parameter>(); + auto p_unsqueeze = wrap_type<v0::Unsqueeze>({p_input_ids, any_input()}); + auto p_opt_reshape_1 = optional<v1::Reshape>({p_unsqueeze, any_input()}); + auto p_opt_convert_1 = optional<v0::Convert>(p_opt_reshape_1); + auto p_kv_shape_current = wrap_type<v3::ShapeOf>({p_opt_convert_1}); + auto p_seq_current = wrap_type<v8::Gather>({p_kv_shape_current, any_input(), any_input()}); + auto p_opt_convert_2 = optional<v0::Convert>(p_seq_current); + + auto p_max_context_len = wrap_type<v0::Parameter>(); + auto p_prev_max_seq_len = wrap_type<v1::Subtract>({p_max_context_len, any_input()}); + auto p_opt_convert_3 = optional<v0::Convert>(p_prev_max_seq_len); + auto p_opt_reshape_2 = optional<v1::Reshape>({p_opt_convert_3, any_input()}); + auto p_total_seq = wrap_type<v1::Add>({p_opt_convert_2, p_opt_reshape_2}); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto total_seq = pattern_map.at(p_total_seq).get_node_shared_ptr(); + std::shared_ptr<Node> replacement = max_context_len; + + auto target_type = total_seq->get_output_element_type(0); + auto required_shape = total_seq->get_output_partial_shape(0); + align_replacement(replacement, required_shape, target_type); + + replace_node(total_seq, replacement); + return true; + }; + + auto m = std::make_shared<Matcher>(p_total_seq, matcher_name); register_matcher(m, callback); -} \ No newline at end of file +}
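A concrete instance of what the new align_replacement helper does (a sketch with assumed values, not code from the patch): if max_context_len is an i32 scalar while the Gather being replaced produced an i64 tensor of shape [1], the helper appends a Convert and a rank-restoring Reshape so consumers see the same type and shape as before.

```cpp
#include <cassert>

#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"

using namespace ov;

int main() {
    // i32 scalar stand-in for the max_context_len Parameter
    auto max_len = std::make_shared<op::v0::Parameter>(element::i32, Shape{});
    // Target: i64 of shape [1], i.e. what the replaced Gather produced.
    std::shared_ptr<Node> repl = std::make_shared<op::v0::Convert>(max_len, element::i64);
    auto shape = op::v0::Constant::create(element::i64, Shape{1}, {1});
    repl = std::make_shared<op::v1::Reshape>(repl, shape, false);
    assert(repl->get_output_element_type(0) == element::i64);
    assert(repl->get_output_shape(0) == Shape{1});
    return 0;
}
```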
diff --git a/src/common/transformations/tests/common_optimizations/sdpa_fusion_test.cpp b/src/common/transformations/tests/common_optimizations/sdpa_fusion_test.cpp new file mode 100644 index 00000000000000..52c10ba5967bd8 --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/sdpa_fusion_test.cpp @@ -0,0 +1,234 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> + +#include <memory> +#include <openvino/core/model.hpp> +#include <openvino/op/add.hpp> +#include <openvino/op/parameter.hpp> +#include <openvino/pass/manager.hpp> +#include <transformations/common_optimizations/sdpa_fusion.hpp> + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/transpose.hpp" + +using namespace testing; +using namespace ov::pass; +using namespace ov; + +TEST_F(TransformationTestsF, SDPAFusionTest1) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f32, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f32, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f32, value_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector<float>{1.0f}); + const auto mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector<float>{0.0f}); + const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, + key, + value, + mask_const, + scale_const, + causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest2) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f}); + const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, + key, + value, + mask_const, + scale_const, + causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +}
+TEST_F(TransformationTestsF, SDPAFusionTest3) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto causal = false; + { + const auto key_t = + std::make_shared<ov::op::v1::Transpose>(key, + op::v0::Constant::create(element::i64, Shape{4}, {0, 1, 3, 2})); + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key_t, false, false); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f}); + const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, + key, + value, + mask_const, + scale_const, + causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest4) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, 32, -1}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, false); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + model_ref = model->clone(); + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +}
+TEST_F(TransformationTestsF, SDPAFusionTest5) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + const PartialShape attention_mask_shape{1, 32, -1, -1}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto mask = std::make_shared<ov::op::v0::Parameter>(element::f16, attention_mask_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto mask_add = std::make_shared<ov::op::v1::Add>(qk, mask); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(mask_add, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value, mask}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto sdpa = + std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, key, value, mask, scale_const, causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value, mask}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest6) { + const PartialShape query_shape{1, 32, 10, 32}; + const PartialShape key_shape{1, 32, 10, 32}; + const PartialShape value_shape{1, 32, 10, 32}; + const PartialShape attention_mask_shape{1, 1, 10, 10}; + + const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape); + const auto key = std::make_shared<ov::op::v0::Parameter>(element::f16, key_shape); + const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape); + const auto mask = std::make_shared<ov::op::v0::Parameter>(element::f16, attention_mask_shape); + const auto causal = false; + { + const auto qk = std::make_shared<ov::op::v0::MatMul>(query, key, false, true); + const auto mask_add = std::make_shared<ov::op::v1::Add>(qk, mask); + const auto softmax = std::make_shared<ov::op::v8::Softmax>(mask_add, -1); + const auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, value, false, false); + + model = std::make_shared<ov::Model>(NodeVector{qkv}, ParameterVector{query, key, value, mask}); + manager.register_pass<ov::pass::SDPAFusion>(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{1.0f}); + const auto sdpa = + std::make_shared<ov::op::v13::ScaledDotProductAttention>(query, key, value, mask, scale_const, causal); + model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value, mask}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +}
query_shape); + const auto key = std::make_shared(element::f16, key_shape); + const auto value = std::make_shared(element::f16, value_shape); + const auto mask = std::make_shared(element::f16, attention_mask_shape); + const auto casual = false; + { + const auto qk = std::make_shared(query, key, false, true); + const auto mask_add = std::make_shared(qk, mask); + const auto softmax = std::make_shared(mask_add, -1); + const auto qkv = std::make_shared(softmax, value, false, false); + + model = std::make_shared(NodeVector{qkv}, ParameterVector{query, key, value, mask}); + manager.register_pass(); + } + + { + const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector{1.0f}); + const auto sdpa = + std::make_shared(query, key, value, mask, scale_const, casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, mask}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAFusionTest7) { + const PartialShape query_shape{1, 8, -1, 32}; + const PartialShape key_shape{-1, 1, 8, 32}; + const PartialShape value_shape{1, 8, -1, 32}; + + const auto query = std::make_shared(element::f16, query_shape); + const auto key = std::make_shared(element::f16, key_shape); + const auto value = std::make_shared(element::f16, value_shape); + { + const auto key_t = + std::make_shared(key, + op::v0::Constant::create(element::i64, Shape{4}, {1, 2, 3, 0})); + const auto qk = std::make_shared(query, key_t, false, false); + const auto softmax = std::make_shared(qk, -1); + const auto qkv = std::make_shared(softmax, value, false, false); + + model = std::make_shared(NodeVector{qkv}, ParameterVector{query, key, value}); + manager.register_pass(); + } +} diff --git a/src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp b/src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp new file mode 100644 index 00000000000000..f922f030a9c43b --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" + +using namespace testing; +using namespace ov::pass; +using namespace ov; + +TEST_F(TransformationTestsF, SDPAScaleFusionTest1) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_const); + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = + std::make_shared(q_scaled, k_scaled, v_scaled, casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + manager.register_pass(); + } + + { + const auto new_mask_const = 
ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto new_scale_const = + ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{64.0f / std::sqrt(32.0f)}); + const auto sdpa = std::make_shared(query, + key, + v_scaled, + new_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest2) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{2.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_const); + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = std::make_shared(q_scaled, + k_scaled, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + manager.register_pass(); + } + + { + const auto new_scale_const = + ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{128.0f}); + const auto sdpa = std::make_shared(query, + key, + v_scaled, + sdpa_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest3) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{2.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_const); + const auto sdpa = std::make_shared(q_scaled, + key, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value}); + manager.register_pass(); + } + + { + const auto new_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{16.0f}); + const auto sdpa = std::make_shared(query, + key, + v_scaled, + sdpa_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, 
ParameterVector{query, key, value}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest4) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{2.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{8.0f}); + const auto scale_dyn = std::make_shared(element::f32, ov::Shape{}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + const auto q_scaled = std::make_shared(query, scale_dyn); + { + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = std::make_shared(q_scaled, + k_scaled, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + manager.register_pass(); + } + + { + const auto new_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{16.0f}); + const auto sdpa = std::make_shared(q_scaled, + key, + v_scaled, + sdpa_mask_const, + new_scale_const, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, SDPAScaleFusionTest5) { + const PartialShape query_shape{1, 32, -1, 32}; + const PartialShape key_shape{1, 32, -1, 32}; + const PartialShape value_shape{1, 32, -1, 32}; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto sdpa_mask_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{0.0f}); + const auto sdpa_scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{1.0f}); + const auto scale_const = ov::op::v0::Constant::create(element::f32, ov::Shape{}, std::vector{1.0f}); + const auto scale_dyn = std::make_shared(element::f32, ov::Shape{}); + const auto v_scaled = std::make_shared(value, scale_const); + const auto casual = false; + { + const auto q_scaled = std::make_shared(query, scale_dyn); + const auto k_scaled = std::make_shared(key, scale_const); + const auto sdpa = std::make_shared(q_scaled, + k_scaled, + v_scaled, + sdpa_mask_const, + sdpa_scale_const, + casual); + + model = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + manager.register_pass(); + } + + { + const auto sdpa = std::make_shared(query, + key, + v_scaled, + sdpa_mask_const, + scale_dyn, + casual); + model_ref = std::make_shared(NodeVector{sdpa}, ParameterVector{query, key, value, scale_dyn}); + } + + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} diff --git 
a/src/common/transformations/tests/op_conversions/sdpa_to_paged_attention_test.cpp b/src/common/transformations/tests/op_conversions/sdpa_to_paged_attention_test.cpp new file mode 100644 index 00000000000000..840309993c939a --- /dev/null +++ b/src/common/transformations/tests/op_conversions/sdpa_to_paged_attention_test.cpp @@ -0,0 +1,618 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/pass/sdpa_to_paged_attention.hpp" + +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/core/model.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/ops.hpp" +#include "openvino/op/paged_attention.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/reduce_mean.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/sqrt.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "transformations/sdpa_to_paged_attention/prev_sequence_length_pattern.hpp" +#include "transformations/sdpa_to_paged_attention/total_sequence_length_pattern.hpp" +#include "transformations/utils/gen_pattern.hpp" +#include "transformations/utils/print_model.hpp" + +using namespace ov; +using namespace std; +using namespace testing; +using namespace ov::op; +using namespace ov::gen_pattern; + +namespace { + +// Constants and Parameters attributes: +auto el_type_i64 = std::pair({"element_type", "i64"}); +auto el_type_i32 = std::pair({"element_type", "i32"}); +auto el_type_f32 = std::pair({"element_type", "f32"}); + +// Convert ops attributes: +auto dest_type_i64 = std::pair({"destination_type", "i64"}); +auto dest_type_f32 = std::pair({"destination_type", "f32"}); +auto dest_type_f16 = std::pair({"destination_type", "f16"}); + +// Other attributes: +auto numpy_broadcast = std::pair({"auto_broadcast", "numpy"}); +auto special_zero_true = std::pair({"special_zero", true}); + +auto single_val = [](int rank, float val) { + return makeConst(element::f32, ov::Shape{std::vector(rank, 1)}, {val}); +}; + +ov::ParameterVector nodes_to_params(const ov::NodeVector& node_vec) { + ov::ParameterVector params; + params.reserve(node_vec.size()); + for (const auto& node : node_vec) { + params.push_back(ov::as_type_ptr(node)); + } + return params; +} + +enum QKV : int { Q = 0, K = 1, V = 2 }; +vector MOCK_VALUE = {1}; + +// original weights = 151936, attention_weights = 12288 +#define WEIGHTS 1024 +#define ATTENTION_WEIGHTS 512 + +class Qwen7bChatSDPA { +public: + static std::shared_ptr gen_embeddings(const std::shared_ptr& input_ids) { + auto view_reshape = makeOP({input_ids, {-1, 0}}, {special_zero_true}); + auto input_ids_i64 = makeOP({view_reshape}, {dest_type_i64}); + + auto weights = makeConst(element::u8, {WEIGHTS, 4096}, MOCK_VALUE); + auto weights_fp16 = makeOP({weights}, {dest_type_f16}); + auto zero_point = makeConst(element::u8, {WEIGHTS, 1}, MOCK_VALUE); + auto zero_point_fp16 = makeOP({zero_point}, {dest_type_f16}); + auto zero_point_subtract = makeOP({weights_fp16, zero_point_fp16}, {numpy_broadcast}); + + auto scale = makeConst(element::f16, 
{WEIGHTS, 1}, MOCK_VALUE); + auto mul_scale = makeOP({zero_point_subtract, scale}, {numpy_broadcast}); + auto fq_weights = makeOP({mul_scale}, {dest_type_f32}); + + return makeOP({fq_weights, input_ids_i64, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_attention_weights() { + auto weights = makeConst(element::u8, {ATTENTION_WEIGHTS, 4096}, MOCK_VALUE); + auto weights_f16 = makeOP({weights}, {dest_type_f16}); + + auto zero_points = makeConst(element::u8, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto zero_points_f16 = makeOP({zero_points}, {dest_type_f16}); + auto subtract = makeOP({weights_f16, zero_points_f16}, {numpy_broadcast}); + + auto scale = makeConst(element::f16, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto mul = makeOP({subtract, scale}, {numpy_broadcast}); + return makeOP({mul}, {dest_type_f32}); + } + + static std::shared_ptr gen_qkv_proj(const std::shared_ptr& embeddings) { + auto _const_0 = single_val(/*rank*/ 3, /*val*/ 2); + auto pow = makeOP({embeddings, _const_0}, {numpy_broadcast}); + auto mean = makeOP({pow, {-1}}, {{"keep_dims", true}}); + + auto _const_1 = single_val(/*rank*/ 3, /*val*/ 1); + auto add = makeOP({mean, _const_1}, {numpy_broadcast}); + auto sqrt = makeOP({add}); + + auto _const_2 = single_val(/*rank*/ 3, /*val*/ 1); + auto div = makeOP({_const_2, sqrt}, {numpy_broadcast, {"m_pythondiv", true}}); + auto mul_0 = makeOP({embeddings, div}, {numpy_broadcast}); + + auto _const_3 = makeConst(element::f32, {1, 1, 4096}, MOCK_VALUE); + auto mul_1 = makeOP({mul_0, _const_3}, {numpy_broadcast}); + auto attention_weights = gen_attention_weights(); + auto linear_matmul = + makeOP({mul_1, attention_weights}, {{"transpose_a", false}, {"transpose_b", true}}); + + auto _const_4 = makeConst(element::f32, {1, 1, ATTENTION_WEIGHTS}, MOCK_VALUE); + auto linear_add = makeOP({linear_matmul, _const_4}, {numpy_broadcast}); + return makeOP({linear_add, 2, {4096, 4096, -1}}); + } + + static std::shared_ptr gen_cache(const std::shared_ptr& input_ids, + const std::shared_ptr& beam_idx, + const std::string& name) { + auto shape_of = makeOP({input_ids}, {{"output_type", "i64"}}); + auto gather = makeOP({shape_of, {0}, 0}, {{"batch_dims", 0}}); + auto concat = makeOP({gather, {0ll}, {32ll}, {128ll}}, {{"axis", 0}}); + auto init_to_read = makeOP({0.000000f, concat}, {{"mode", "numpy"}}); + auto cache = makeOP( + {init_to_read}, + {{"variable_id", name}, {"variable_type", "f32"}, {"variable_shape", PartialShape{DYN, DYN, 32, 128}}}); + return makeOP({cache, beam_idx, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_current_len(const std::shared_ptr& input_ids) { + auto shape_of = makeOP({input_ids}, {{"output_type", "i64"}}); + return makeOP({shape_of, {1}, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_past_len(const std::shared_ptr& k_cache) { + auto shape_of = makeOP({k_cache}, {{"output_type", "i64"}}); + return makeOP({shape_of, {1}, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_total_len(const std::shared_ptr& cur_len, + const std::shared_ptr& past_len) { + return makeOP({cur_len, past_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_rope(QKV idx, + const std::shared_ptr& qkv_proj, + const std::shared_ptr& head_size, + const std::shared_ptr& sliced_sin, + const std::shared_ptr& sliced_cos) { + auto current_k = makeOP({qkv_proj->output(idx), {0, 0, 32, 128}}, {special_zero_true}); + auto sliced_k = makeOP({current_k, {0}, head_size, {1}, {3}}); + auto mul_1 = makeOP({sliced_k, sliced_cos}, {numpy_broadcast}); + + auto 
reshape = makeOP({sliced_k, {0, 0, 32, 2, 64}}, {special_zero_true}); + auto split_1 = makeOP({reshape, -2}, {{"num_splits", 2}}); + auto list_unpack_1 = makeOP({split_1->output(1), -2}); + + auto _const = single_val(/*rank*/ 4, /*val*/ 1); + auto mul_2 = makeOP({list_unpack_1, _const}, {numpy_broadcast}); + auto list_unpack_2 = makeOP({split_1->output(0), -2}); + auto concat = makeOP({mul_2, list_unpack_2}, {{"axis", -1}}); + + auto mul_3 = makeOP({concat, sliced_sin}, {numpy_broadcast}); + return makeOP({mul_1, mul_3}, {numpy_broadcast}); + } + + static std::shared_ptr gen_rope_emb_sin(const std::shared_ptr& total_seq_len, + const std::shared_ptr& neg_mul, + std::shared_ptr& head_size) { + auto sin = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto sliced_sin_by_total = makeOP({sin, {0}, total_seq_len, {1}, {1}}); + auto rotary_emb_sin_shape = makeOP({sliced_sin_by_total}, {{"output_type", "i64"}}); + head_size = makeOP({rotary_emb_sin_shape, {3}, 0}, {{"batch_dims", 0}}); + return makeOP({sliced_sin_by_total, neg_mul, {LLONG_MAX}, {1}, {1}}); + } + + static std::shared_ptr gen_rope_emb_cos(const std::shared_ptr& total_seq_len, + const std::shared_ptr& neg_mul) { + auto cos = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto sliced_cos_by_total = makeOP({cos, {0}, total_seq_len, {1}, {1}}); + return makeOP({sliced_cos_by_total, neg_mul, {LLONG_MAX}, {1}, {1}}); + } + + static std::shared_ptr neg_mul(const std::shared_ptr& current_seq_len) { + return makeOP({current_seq_len, {-1ll}}, {numpy_broadcast}); + } + + static std::shared_ptr gen_V(const std::shared_ptr& cache, const std::shared_ptr& qkv_proj) { + auto v_current = makeOP({qkv_proj->output(2), {0, 0, 32, 128}}, {special_zero_true}); + auto v_total = makeOP({cache, v_current}, {{"axis", 1}}); + return makeOP({v_total, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_K(const std::shared_ptr& cache, const std::shared_ptr& rope_K) { + auto full_k = makeOP({cache, rope_K}, {{"axis", 1}}); + return makeOP({full_k, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_Q(const std::shared_ptr& past_seq_len_2, + const std::shared_ptr& total_seq_len_2, + const std::shared_ptr& rope_Q) { + auto _const = makeConst(element::f32, {1, 32767, 1, 1}, MOCK_VALUE); + auto slice = makeOP({_const, past_seq_len_2, total_seq_len_2, {1}, {1}}); + auto mul = makeOP({rope_Q, slice}, {numpy_broadcast}); + return makeOP({mul, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_total_seq_len_2(const std::shared_ptr& past_k_len, + const std::shared_ptr& rope_k) { + auto shape_rope_k = makeOP({rope_k}, {{"output_type", "i64"}}); + auto cur_len = makeOP({shape_rope_k, {1}, 0}, {{"batch_dims", 0}}); + return makeOP({past_k_len, cur_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_past_seq_len_2(const std::shared_ptr& total_seq_len, + const std::shared_ptr& rope_q) { + auto shape_rope_q = makeOP({rope_q}, {{"output_type", "i64"}}); + auto cur_len = makeOP({shape_rope_q, {1}, 0}, {{"batch_dims", 0}}); + return makeOP({total_seq_len, cur_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_attention_mask(const std::shared_ptr& Q_in, + const std::shared_ptr& attention_mask_in, + const std::shared_ptr& total_seq_len) { + auto _const = makeConst(element::boolean, {1, 1, 8192, 8192}, MOCK_VALUE); + auto shape_of_q = makeOP({Q_in}, {{"output_type", "i64"}}); + auto gather = makeOP({shape_of_q, {2}, 0}, {{"batch_dims", 0}}); + auto sub_1 = makeOP({total_seq_len, gather}, {numpy_broadcast}); + auto concat = makeOP({sub_1, {0ll}}, 
{{"axis", 0}}); + auto broadcast = makeOP({total_seq_len, {2}}, {{"mode", "numpy"}}); + auto slice = makeOP({_const, concat, broadcast, {1, 1}, {2, 3}}); + auto bitwise_not = makeOP({slice}); + + auto _const_1 = single_val(/*rank*/ 4, /*val*/ 1); + auto view_reshape = makeOP({attention_mask_in, {0, 0}}, {special_zero_true}); + auto unsqueeze_0 = makeOP({view_reshape, 1}); + auto unsqueeze_1 = makeOP({unsqueeze_0, 2}); + auto convert_0 = makeOP({unsqueeze_1}, {dest_type_f32}); + + auto _const_2 = single_val(/*rank*/ 4, /*val*/ 1); + auto mul_1 = makeOP({convert_0, _const_2}, {numpy_broadcast}); + auto sub_2 = makeOP({_const_1, mul_1}, {numpy_broadcast}); + + auto _const_3 = single_val(/*rank*/ 4, /*val*/ 1); + auto mul_2 = makeOP({sub_2, _const_3}, {numpy_broadcast}); + auto list_construct = makeOP({{1ll}, {1ll}, gather, {1ll}}, {{"axis", 0}}); + auto expand_broadcast = makeOP({mul_2, list_construct}, {{"mode", "bidirectional"}}); + return makeOP({bitwise_not, -FLT_MAX, expand_broadcast}, {numpy_broadcast}); + } +}; + +class Qwen7bChatPA { +public: + static std::shared_ptr gen_embeddings(const std::shared_ptr& input_ids) { + auto weights = makeConst(element::u8, {WEIGHTS, 4096}, MOCK_VALUE); + auto weights_fp16 = makeOP({weights}, {dest_type_f16}); + + auto zero_point = makeConst(element::u8, {WEIGHTS, 1}, MOCK_VALUE); + auto zero_point_fp16 = makeOP({zero_point}, {dest_type_f16}); + auto sub = makeOP({weights_fp16, zero_point_fp16}, {numpy_broadcast}); + + auto scale = makeConst(element::f16, {WEIGHTS, 1}, MOCK_VALUE); + auto mul = makeOP({sub, scale}, {numpy_broadcast}); + auto mul_fp32 = makeOP({mul}, {dest_type_f32}); + + auto reshape_view = makeOP({input_ids, {-1, 0}}, {special_zero_true}); + auto reshape_view_i64 = makeOP({reshape_view}, {dest_type_i64}); + return makeOP({mul_fp32, reshape_view_i64, 0}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_qkv_proj(const std::shared_ptr& embeddings) { + auto _const_0 = makeConst(element::f32, {1, 1, 1}, MOCK_VALUE); + auto pow = makeOP({embeddings, _const_0}, {numpy_broadcast}); + auto mean = makeOP({pow, {-1}}, {{"keep_dims", true}}); + auto _const_1 = makeConst(element::f32, {1, 1, 1}, MOCK_VALUE); + auto add_0 = makeOP({mean, _const_1}, {numpy_broadcast}); + + auto sqrt = makeOP({add_0}); + auto _const_2 = makeConst(element::f32, {1, 1, 1}, MOCK_VALUE); + auto div = makeOP({_const_2, sqrt}, {numpy_broadcast, {"m_pythondiv", true}}); + auto mul_0 = makeOP({embeddings, div}, {numpy_broadcast}); + + auto _const_3 = makeConst(element::f32, {1, 1, 4096}, MOCK_VALUE); + auto mul_1 = makeOP({mul_0, _const_3}, {numpy_broadcast}); + + auto _const_4 = makeConst(element::u8, {ATTENTION_WEIGHTS, 4096}, MOCK_VALUE); + auto convert_0 = makeOP({_const_4}, {dest_type_f16}); + + auto _const_5 = makeConst(element::u8, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto convert_1 = makeOP({_const_5}, {dest_type_f16}); + auto sub = makeOP({convert_0, convert_1}, {numpy_broadcast}); + + auto _const_6 = makeConst(element::f16, {ATTENTION_WEIGHTS, 1}, MOCK_VALUE); + auto mul_2 = makeOP({sub, _const_6}, {numpy_broadcast}); + auto convert_2 = makeOP({mul_2}, {dest_type_f32}); + auto matmul = makeOP({mul_1, convert_2}, {{"transpose_a", false}, {"transpose_b", true}}); + auto Constant_270 = makeConst(element::f32, {1, 1, ATTENTION_WEIGHTS}, MOCK_VALUE); + auto add_1 = makeOP({matmul, Constant_270}, {numpy_broadcast}); + + return makeOP({add_1, 2, {4096, 4096, -1}}); + } + + static std::shared_ptr gen_rope(QKV idx, + const std::shared_ptr& qkv_proj, + const 
std::shared_ptr& head_size, + const std::shared_ptr& sin, + const std::shared_ptr& cos) { + auto Q_or_K = makeOP({qkv_proj->output(idx), {0, 0, 32, 128}}, {special_zero_true}); + auto sliced = makeOP({Q_or_K, {0}, head_size, {1}, {3}}); + auto mul_0 = makeOP({sliced, sin}, {numpy_broadcast}); + + auto reshape = makeOP({sliced, {0, 0, 32, 2, 64}}, {special_zero_true}); + auto split = makeOP({reshape, -2}, {{"num_splits", 2}}); + auto squeeze_0 = makeOP({split->output(1), -2}); + auto _const_0 = makeConst(element::f32, {1, 1, 1, 1}, {1.000000f}); + auto mul_1 = makeOP({squeeze_0, _const_0}, {numpy_broadcast}); + + auto squeeze_1 = makeOP({split->output(0), -2}); + auto concat = makeOP({mul_1, squeeze_1}, {{"axis", -1}}); + auto mul_2 = makeOP({concat, cos}, {numpy_broadcast}); + return makeOP({mul_0, mul_2}, {numpy_broadcast}); + } + + static std::shared_ptr gen_rope_emb_sin(const std::shared_ptr& max_context_len, + const std::shared_ptr& position_ids, + std::shared_ptr& head_size) { + auto sin = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto slice_sin = makeOP({sin, position_ids, 1}, {{"batch_dims", 0}}); + + auto slice = makeOP({sin, {0}, max_context_len, {1}, {1}}); + auto shape_of = makeOP({slice}, {{"output_type", "i64"}}); + head_size = makeOP({shape_of, {3}, 0}, {{"batch_dims", 0}}); + + return makeOP({slice_sin, {-1, 1, 1, 128}}, {{"special_zero", false}}); + } + + static std::shared_ptr gen_rope_emb_cos(const std::shared_ptr& max_context_len, + const std::shared_ptr& position_ids) { + auto cos = makeConst(element::f32, {1, 4096, 1, 128}, MOCK_VALUE); + auto slice = makeOP({cos, position_ids, 1}, {{"batch_dims", 0}}); + return makeOP({slice, {-1, 1, 1, 128}}, {{"special_zero", false}}); + } + + static std::shared_ptr align_pa_layout(const std::shared_ptr& pa, + const std::shared_ptr& head_size) { + auto shape = makeOP({{0ll}, {1ll}, {-1ll}, head_size}, {{"axis", 0}}); + auto reshaped = makeOP({pa->output(0), shape}, {special_zero_true}); + return makeOP({reshaped, {0, 2, 1, 3}}); + } + + static std::shared_ptr gen_current_len(const std::shared_ptr& rope_K) { + auto shape_of = makeOP({rope_K}, {{"output_type", "i32"}}); + return makeOP({shape_of, {1}, 0ll}, {{"batch_dims", 0}}); + } + + static std::shared_ptr gen_past_len(const std::shared_ptr& input_ids, + const std::shared_ptr& max_context_len) { + auto shape_of = makeOP({input_ids}, {{"output_type", "i64"}}); + auto cur_len = makeOP({shape_of, 1ll, 0ll}, {{"batch_dims", 0}}); + auto cur_len_i32 = makeOP({cur_len}, {{"destination_type", "i32"}}); + + auto past_len = makeOP({max_context_len, cur_len_i32}, {numpy_broadcast}); + auto past_len_i32 = makeOP({past_len}, {{"destination_type", "i32"}}); + return makeOP({past_len_i32, {1}}, {special_zero_true}); + } + + static std::shared_ptr gen_total_len(const std::shared_ptr& cur_len, + const std::shared_ptr& past_len) { + return makeOP({past_len, cur_len}, {numpy_broadcast}); + } + + static std::shared_ptr gen_V(const std::shared_ptr& qkv_proj, std::shared_ptr& head_size) { + auto current_V = makeOP({qkv_proj->output(2), {0, 0, 32, 128}}, {special_zero_true}); + auto gather = makeOP({{0, 2, 1, 3}, {0, 2, 1, 3}, 0ll}, {{"batch_dims", 0}}); + auto transpose = makeOP({current_V, gather}); + + auto shape_of = makeOP({transpose}, {{"output_type", "i64"}}); + auto gather_2 = makeOP({shape_of, -1ll, 0ll}, {{"batch_dims", 0}}); + head_size = makeOP({gather_2, 0}); + + return makeOP({transpose, {0, -1}}, {special_zero_true}); + } + + static std::shared_ptr gen_K(const 
std::shared_ptr& rope_K) { + auto gather = makeOP({{0, 2, 1, 3}, {0, 2, 1, 3}, 0ll}, {{"batch_dims", 0}}); + auto transpose = makeOP({rope_K, gather}); + return makeOP({transpose, {0, -1}}, {special_zero_true}); + } + + static std::shared_ptr gen_Q(const std::shared_ptr& total_seq_len, + const std::shared_ptr& rope_Q) { + auto _const_1 = makeConst(element::f32, {1, 32767, 1, 1}, MOCK_VALUE); + auto shape_of = makeOP({rope_Q}, {{"output_type", "i32"}}); + auto current_seq_len = makeOP({shape_of, {1}, 0ll}, {{"batch_dims", 0}}); + auto past_seq_len = makeOP({total_seq_len, current_seq_len}, {numpy_broadcast}); + + auto slice = makeOP({_const_1, past_seq_len, total_seq_len, {1}, {1}}); + auto mul = makeOP({rope_Q, slice}, {numpy_broadcast}); + auto transpose_1 = makeOP({mul, {0, 2, 1, 3}}); + + auto transpose_2 = makeOP({transpose_1, {0, 2, 1, 3}}); + return makeOP({transpose_2, {0, -1}}, {special_zero_true}); + } +}; + +} // namespace + +TEST_F(TransformationTestsF, SDPAToPA_Qwen) { + { + // Inputs to SDPA transformer: + auto beam_idx = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto position_ids = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, el_type_i64}); + auto attention_mask = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, el_type_i64}); + auto input_ids = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, el_type_i64}); + ParameterVector params = nodes_to_params({position_ids, input_ids, attention_mask, beam_idx}); + + beam_idx->output(0).add_names({"beam_idx"}); + position_ids->output(0).add_names({"position_ids"}); + attention_mask->output(0).add_names({"attention_mask"}); + input_ids->output(0).add_names({"input_ids"}); + + // Embeddings processing: + auto embeddings = Qwen7bChatSDPA::gen_embeddings(input_ids); + auto qkv_proj = Qwen7bChatSDPA::gen_qkv_proj(embeddings); + + // KV cache: + auto k_cache = Qwen7bChatSDPA::gen_cache(input_ids, beam_idx, "K_cache"); + auto v_cache = Qwen7bChatSDPA::gen_cache(input_ids, beam_idx, "V_cache"); + + // Current/past/total Seq lengths calculation: + auto current_seq_len = Qwen7bChatSDPA::gen_current_len(input_ids); + auto past_seq_len = Qwen7bChatSDPA::gen_past_len(k_cache); + auto total_seq_len = Qwen7bChatSDPA::gen_total_len(current_seq_len, past_seq_len); + + // RoPE emb sin/cos init: + auto neg_cur_seq_len = Qwen7bChatSDPA::neg_mul(current_seq_len); + auto head_size = shared_ptr(); + auto rope_emb_sin = Qwen7bChatSDPA::gen_rope_emb_sin(total_seq_len, neg_cur_seq_len, head_size); + auto rope_emb_cos = Qwen7bChatSDPA::gen_rope_emb_cos(total_seq_len, neg_cur_seq_len); + + // RoPE for Q,K inputs: + auto rope_q = Qwen7bChatSDPA::gen_rope(QKV::Q, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + auto rope_k = Qwen7bChatSDPA::gen_rope(QKV::K, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + + // Lengths: + auto total_seq_len_2 = Qwen7bChatSDPA::gen_total_seq_len_2(past_seq_len, rope_k); + auto past_seq_len_2 = Qwen7bChatSDPA::gen_past_seq_len_2(total_seq_len_2, rope_q); + + // Q, K, V: + auto Q = Qwen7bChatSDPA::gen_Q(past_seq_len_2, total_seq_len_2, rope_q); + auto K = Qwen7bChatSDPA::gen_K(k_cache, rope_k); + auto V = Qwen7bChatSDPA::gen_V(v_cache, qkv_proj); + + // Attention mask: + auto attention_mask_to_sdpa = Qwen7bChatSDPA::gen_attention_mask(Q, attention_mask, total_seq_len_2); + + // SDPA: + auto sdpa = makeOP({Q, K, V, attention_mask_to_sdpa}, {{"causal", false}}); + auto res = makeOP({sdpa}); + + model = std::make_shared(OutputVector{res}, params); + manager.register_pass(); + } + + { + // Inputs to PA transformer: + 
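+        // What the reference graph below models: after conversion, Q/K/V reach
+        // the paged-attention op flattened to [total_tokens, num_heads * head_size],
+        // together with the per-layer key/value caches and the scheduling inputs
+        // (past_lens, subsequence_begins, block_indices, block_indices_begins,
+        // scale, sliding_window, alibi_slopes, max_context_len) declared next.
+        // The flattening convention used by gen_Q/gen_K/gen_V above, sketched
+        // with assumed opset versions (v1::Reshape, v0::Constant):
+        //
+        //   auto flat = std::make_shared<ov::op::v1::Reshape>(
+        //       transposed,  // layout after the {0, 2, 1, 3} Transpose
+        //       ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {0, -1}),
+        //       /*special_zero=*/true);  // special_zero keeps dim 0; -1 folds the rest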
auto max_context_len = makeOP({}, {{"shape", PartialShape{}}, el_type_i32}); + auto block_indices_begins = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto block_indices = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto subsequence_begins = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto past_lens = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i32}); + auto value_cache_0 = makeOP({}, {{"shape", PartialShape{DYN, 32, 128}}, el_type_f32}); + auto key_cache_0 = makeOP({}, {{"shape", PartialShape{DYN, 32, 128}}, el_type_f32}); + auto input_ids = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto position_ids = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto params = nodes_to_params({max_context_len, + block_indices_begins, + block_indices, + subsequence_begins, + past_lens, + value_cache_0, + key_cache_0, + input_ids, + position_ids}); + + // Inputs pre-processing: + auto max_context_len_i64 = makeOP({max_context_len}, {dest_type_i64}); + auto max_context_len_aligned = makeOP({max_context_len_i64, {1}}, {special_zero_true}); + auto input_ids_aligned = makeOP({input_ids, 1}); + auto position_ids_aligned = makeOP({position_ids, 1}); + + // Embeddings processing: + auto embeddings = Qwen7bChatPA::gen_embeddings(input_ids_aligned); + auto qkv_proj = Qwen7bChatPA::gen_qkv_proj(embeddings); + + // RoPE emb sin/cos init: + auto head_size = shared_ptr(); + auto rope_emb_sin = Qwen7bChatPA::gen_rope_emb_sin(max_context_len_aligned, position_ids_aligned, head_size); + auto rope_emb_cos = Qwen7bChatPA::gen_rope_emb_cos(max_context_len_aligned, position_ids_aligned); + + // rope Q, K: + auto rope_Q = Qwen7bChatPA::gen_rope(QKV::Q, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + auto rope_K = Qwen7bChatPA::gen_rope(QKV::K, qkv_proj, head_size, rope_emb_sin, rope_emb_cos); + + // Current/past/total Seq lengths calculation: + auto current_seq_len = Qwen7bChatPA::gen_current_len(rope_K); + auto past_seq_len = Qwen7bChatPA::gen_past_len(input_ids_aligned, max_context_len); + auto total_seq_len = Qwen7bChatPA::gen_total_len(current_seq_len, past_seq_len); + + // Q, K, V: + shared_ptr head_size_2; + auto Q = Qwen7bChatPA::gen_Q(total_seq_len, rope_Q); + auto K = Qwen7bChatPA::gen_K(rope_K); + auto V = Qwen7bChatPA::gen_V(qkv_proj, head_size_2); + + // Additional PA arguments: + auto sliding_window = std::make_shared(element::i32, Shape{}, 0); + auto alibi_slopes = std::make_shared(element::f32, Shape{0}); + auto scale = std::make_shared(element::f32, Shape{}, MOCK_VALUE); + + // PagedAttention: + auto pa = std::make_shared(OutputVector{Q, + K, + V, + key_cache_0, + value_cache_0, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len}); + pa->set_out_type(0, element::i64); + auto pa_aligned = Qwen7bChatPA::align_pa_layout(pa, head_size_2); + auto res = makeOP({pa_aligned}); + + model_ref = std::make_shared(OutputVector{res}, params); + } + // TODO: align precisions, check the copying of "fuse_names" attr in SDPAToPagedAttention + // checking the graph structure and names, other checks are temporarily disabled: + comparator.disable(FunctionsComparator::PRECISIONS); + disable_rt_info_check(); +} + +TEST_F(TransformationTestsF, SDPAToPA_TotalSequenceLengthPatternQwen) { + { + // Inputs to SDPA transformer: + auto beam_idx = makeOP({}, {{"shape", PartialShape{DYN}}, el_type_i64}); + auto input_ids = makeOP({}, {{"shape", PartialShape{DYN, DYN}}, 
el_type_i64}); + ParameterVector params = nodes_to_params({input_ids, beam_idx}); + + // K cache + auto k_cache = Qwen7bChatSDPA::gen_cache(input_ids, beam_idx, "K_cache"); + + // Current/past/total Seq lengths calculation: + auto current_len = Qwen7bChatSDPA::gen_current_len(input_ids); + auto past_len = Qwen7bChatSDPA::gen_past_len(k_cache); + auto total_len = Qwen7bChatSDPA::gen_total_len(current_len, past_len); + auto result = std::make_shared(total_len); + + // Expected that these Nodes to be created inside SDPAToPagedAttention + auto new_input_ids = std::make_shared(element::i64, PartialShape{DYN}); + auto axis = v0::Constant::create(element::i32, Shape{}, {1}); + auto aligned_input_ids = std::make_shared(new_input_ids, axis); + + input_ids->output(0).replace(aligned_input_ids); + auto max_context_len = std::make_shared(element::i32, PartialShape{}); + max_context_len->output(0).set_names({"max_context_len"}); + auto position_ids = std::make_shared(element::i64, PartialShape{DYN}); + position_ids->output(0).set_names({"position_ids"}); + + params.push_back(max_context_len); + params.push_back(new_input_ids); + + // Model and Transformations: + model = std::make_shared(ResultVector{result}, params); + manager.register_pass(aligned_input_ids, max_context_len, position_ids); + manager.register_pass(max_context_len); + } + + { + // Inputs to PA transformer: + auto max_context_len = makeOP({}, {{"shape", PartialShape{}}, el_type_i32}); + auto params = nodes_to_params({max_context_len}); + + // Inputs pre-processing: + auto max_context_len_i64 = makeOP({max_context_len}, {dest_type_i64}); + auto max_context_len_aligned = makeOP({max_context_len_i64, {1}}, {special_zero_true}); + + auto result = std::make_shared(max_context_len_aligned); + model_ref = std::make_shared(ResultVector{result}, params); + } + // TODO: align precisions, check the copying of "fuse_names" attr in SDPAToPagedAttention + // checking the graph structure and names, other checks are temporarily disabled: + comparator.disable(FunctionsComparator::PRECISIONS); + disable_result_friendly_names_check(); + disable_rt_info_check(); +} diff --git a/src/core/include/openvino/op/fake_convert.hpp b/src/core/include/openvino/op/fake_convert.hpp index c3eaa43b98a51b..16ef7a0337c15b 100644 --- a/src/core/include/openvino/op/fake_convert.hpp +++ b/src/core/include/openvino/op/fake_convert.hpp @@ -68,6 +68,7 @@ class OPENVINO_API FakeConvert : public Op { bool has_evaluate() const override; std::string get_destination_type() const; + void set_destination_type(ov::element::Type destination_type); const ov::element::Type& get_destination_element_type() const; private: diff --git a/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp b/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp index 74aeacb0719cee..d52e78dbd6a489 100644 --- a/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp +++ b/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp @@ -19,7 +19,7 @@ class OPENVINO_API SDPAToPagedAttention : public ModelPass { public: OPENVINO_MODEL_PASS_RTTI("SDPAToPagedAttention"); - SDPAToPagedAttention(bool use_block_indices_inputs = false, bool use_score_outputs = false); + explicit SDPAToPagedAttention(bool use_block_indices_inputs = false, bool use_score_outputs = false); bool run_on_model(const std::shared_ptr& model) override; private: diff --git a/src/core/src/op/fake_convert.cpp b/src/core/src/op/fake_convert.cpp index 5b3c8f8d8e9938..517674402ef872 100644 --- a/src/core/src/op/fake_convert.cpp +++ 
b/src/core/src/op/fake_convert.cpp @@ -79,6 +79,10 @@ std::string FakeConvert::get_destination_type() const { return m_destination_type.get_type_name(); } +void FakeConvert::set_destination_type(ov::element::Type destination_type) { + m_destination_type = destination_type; +} + const ov::element::Type& FakeConvert::get_destination_element_type() const { return m_destination_type; } diff --git a/src/core/src/pass/manager.cpp b/src/core/src/pass/manager.cpp index a6f1fc287e221c..b084ec4dc38e09 100644 --- a/src/core/src/pass/manager.cpp +++ b/src/core/src/pass/manager.cpp @@ -5,6 +5,7 @@ #include "openvino/pass/manager.hpp" #include +#include #include #include #include diff --git a/src/core/src/pass/sdpa_to_paged_attention.cpp b/src/core/src/pass/sdpa_to_paged_attention.cpp index 872e4539eda8df..e6fc744bb5ef4f 100644 --- a/src/core/src/pass/sdpa_to_paged_attention.cpp +++ b/src/core/src/pass/sdpa_to_paged_attention.cpp @@ -81,15 +81,12 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptrset_partial_shape(PartialShape{-1}); + auto input_ids_target_inputs = input_ids_node->get_output_target_inputs(0); auto unsqueezed_input_ids = std::make_shared(input_ids_node, v0::Constant::create(element::i32, Shape{}, {1})); - replace_node(input_ids_node, unsqueezed_input_ids); - - auto cur_seq_len = std::make_shared(std::make_shared(unsqueezed_input_ids), - v0::Constant::create(element::i64, Shape{}, {1}), - v0::Constant::create(element::i64, Shape{}, {0})); - auto prev_max_seq_len = - std::make_shared(max_context_len, std::make_shared(cur_seq_len, element::i32)); + for (const auto& target : input_ids_target_inputs) { + target.replace_source_output(unsqueezed_input_ids); + } ParameterVector kv_parameters; ParameterVector parameters_to_remove; @@ -106,15 +103,15 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptrset_partial_shape(PartialShape{-1}); position_ids->validate_and_infer_types(); } + auto position_ids_target_inputs = position_ids->get_output_target_inputs(0); auto unsqueezed_position_ids = std::make_shared(position_ids, v0::Constant::create(element::i32, Shape{}, {1})); - replace_node(position_ids, unsqueezed_position_ids); + for (const auto& target : position_ids_target_inputs) { + target.replace_source_output(unsqueezed_position_ids); + } int layer_index = 0; - auto batch_dim = - std::make_shared(position_ids); // it is not always required, so will be disposed if not needed - ov::pass::Manager manager("SDPA to PA"); manager.set_per_pass_validation(false); manager.register_pass(kv_parameters, @@ -127,9 +124,12 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr(prev_max_seq_len, batch_dim); + + manager.register_pass(unsqueezed_input_ids, max_context_len, position_ids); manager.register_pass(max_context_len); - manager.register_pass(unsqueezed_position_ids->output(0)); + manager.register_pass(max_context_len); + manager.register_pass(unsqueezed_position_ids); + manager.register_pass(unsqueezed_position_ids); manager.run_passes(model); { diff --git a/src/frontends/onnx/tests/__init__.py b/src/frontends/onnx/tests/__init__.py index ef8cebfa361e3f..fdf1295dfd1dbe 100644 --- a/src/frontends/onnx/tests/__init__.py +++ b/src/frontends/onnx/tests/__init__.py @@ -147,7 +147,7 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): skip_dynamic_model = pytest.mark.skip(reason="CPU plug-in can't load a model with dynamic output shapes via legacy API") # ONNX 1.14 -xfail_issue_119896 = xfail_test(reason="Unsupported 
element type: FLOAT8") +xfail_issue_119896 = xfail_test(reason="Unsupported element type: FLOAT8", strict=False) xfail_issue_119900 = xfail_test(reason="While validating ONNX node '': " "half_pixel_symmetric - this type of coordinate transformation mode " "is not supported. Choose one of the following modes: " diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp index f8bd16173b8fce..29c8bfddbd1ca4 100644 --- a/src/inference/src/os/lin/lin_system_conf.cpp +++ b/src/inference/src/os/lin/lin_system_conf.cpp @@ -23,76 +23,107 @@ CPU::CPU() { std::vector> system_info_table; std::vector node_info_table; - auto get_cache_info_linux = [&]() { + constexpr int cache_info_mode = 1; + constexpr int freq_info_mode = 2; + + auto get_info_linux = [&](int mode) { int cpu_index = 0; - int cache_index = 0; - int cache_files = 3; + int file_index = 0; + int max_files = 3; - std::vector one_info(cache_files); + std::string one_info; - while (1) { - for (int n = 0; n < cache_files; n++) { - cache_index = (n == 0) ? n : n + 1; - - std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + "/cache/index" + - std::to_string(cache_index) + "/shared_cpu_list"); - if (!cache_file.is_open()) { - cache_index = -1; - break; - } - std::string cache_info; - std::getline(cache_file, cache_info); - one_info[n] = std::move(cache_info); - } + std::string::size_type pos = 0; + std::string::size_type endpos = 0; + std::string sub_str; - if (cache_index == -1) { - if (cpu_index == 0) { - return -1; - } else { - return 0; - } - } else { - system_info_table.push_back(one_info); - cpu_index++; - } + int core_1; + int core_2; + + system_info_table.clear(); + + std::ifstream possible_file("/sys/devices/system/cpu/possible"); + std::string possible_info; + + if (possible_file.is_open()) { + std::getline(possible_file, possible_info); + } else { + return -1; } - return 0; - }; + if ((endpos = possible_info.find('-', pos)) != std::string::npos) { + sub_str = possible_info.substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = possible_info.substr(endpos + 1); + core_2 = std::stoi(sub_str); + system_info_table.resize(core_2 + 1, std::vector(max_files, "")); + } else { + return -1; + } - auto get_freq_info_linux = [&]() { - int cpu_index = 0; - int cache_index = 0; + std::ifstream online_file("/sys/devices/system/cpu/online"); + std::string online_info; - std::vector file_name = {"/topology/core_cpus_list", - "/topology/physical_package_id", - "/cpufreq/cpuinfo_max_freq"}; - int num_of_files = file_name.size(); - std::vector one_info(num_of_files); + if (online_file.is_open()) { + std::getline(online_file, online_info); + } else { + system_info_table.clear(); + return -1; + } while (1) { - for (int n = 0; n < num_of_files; n++) { - cache_index = n; + if ((endpos = online_info.find('-', pos)) != std::string::npos) { + sub_str = online_info.substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = online_info.substr(endpos + 1); + core_2 = std::stoi(sub_str); - std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + file_name[n]); - if (!cache_file.is_open()) { - cache_index = -1; - break; + for (cpu_index = core_1; cpu_index <= core_2; cpu_index++) { + if (mode == cache_info_mode) { + for (int n = 0; n < max_files; n++) { + file_index = (n == 0) ? 
n : n + 1; + one_info.clear(); + + std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + + "/cache/index" + std::to_string(file_index) + "/shared_cpu_list"); + if (cache_file.is_open()) { + std::getline(cache_file, one_info); + } else { + if ((cpu_index == core_1) && (n == 0)) { + system_info_table.clear(); + return -1; + } + } + system_info_table[cpu_index][n] = std::move(one_info); + } + } else { + std::vector file_name = {"/topology/core_cpus_list", + "/topology/physical_package_id", + "/cpufreq/cpuinfo_max_freq"}; + + for (int n = 0; n < max_files; n++) { + one_info.clear(); + + std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + + file_name[n]); + if (cache_file.is_open()) { + std::getline(cache_file, one_info); + } else { + if ((cpu_index == core_1) && (n == 2)) { + system_info_table.clear(); + return -1; + } + } + system_info_table[cpu_index][n] = std::move(one_info); + } + } } - std::string cache_info; - std::getline(cache_file, cache_info); - one_info[n] = std::move(cache_info); } - if (cache_index == -1) { - if (cpu_index == 0) { - return -1; - } else { - return 0; - } + if ((pos = online_info.find(',', endpos)) != std::string::npos) { + pos++; } else { - system_info_table.push_back(one_info); - cpu_index++; + break; } } @@ -190,20 +221,23 @@ CPU::CPU() { } else { _processors = valid_cpu_mapping_table.size(); _cpu_mapping_table.swap(valid_cpu_mapping_table); - update_valid_processor_linux(std::move(phy_core_list), - _numa_nodes, - _cores, - _proc_type_table, - _cpu_mapping_table); + { + std::lock_guard lock{_cpu_mutex}; + update_valid_processor_linux(std::move(phy_core_list), + _numa_nodes, + _cores, + _proc_type_table, + _cpu_mapping_table); + } return 0; } }; get_node_info_linux(); - if (!get_cache_info_linux()) { + if (!get_info_linux(cache_info_mode)) { parse_cache_info_linux(system_info_table, - node_info_table, + std::move(node_info_table), _processors, _numa_nodes, _sockets, @@ -215,9 +249,9 @@ CPU::CPU() { if ((_proc_type_table.size() == 0) || ((_proc_type_table[0][MAIN_CORE_PROC] == 0) && (_proc_type_table[0][ALL_PROC] > 0) && (_proc_type_table[0][ALL_PROC] != _proc_type_table[0][EFFICIENT_CORE_PROC]))) { - if (!get_freq_info_linux()) { + if (!get_info_linux(freq_info_mode)) { parse_freq_info_linux(system_info_table, - node_info_table, + std::move(node_info_table), _processors, _numa_nodes, _sockets, @@ -471,56 +505,73 @@ void parse_cache_info_linux(const std::vector> system_i const std::vector line_value_0({0, 0, 0, 0, -1, -1}); - for (int n = 0; n < _processors; n++) { - if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { - std::string::size_type pos = 0; - std::string::size_type endpos = 0; - std::string sub_str; - - int core_1; - int core_2; + std::vector offline_list; + int info_index = 0; - if (0 == _sockets) { - _proc_type_table.push_back(line_value_0); - } else { - _proc_type_table.push_back(_proc_type_table[0]); - _proc_type_table[0] = line_value_0; - } - - while (1) { - if ((endpos = system_info_table[n][2].find('-', pos)) != std::string::npos) { - sub_str = system_info_table[n][2].substr(pos, endpos - pos); - core_1 = std::stoi(sub_str); - sub_str = system_info_table[n][2].substr(endpos + 1); - core_2 = std::stoi(sub_str); + for (int n = 0; n < _processors; n++) { + if ((system_info_table[n][2].size() > 0) || (system_info_table[n][1].size() > 0)) { + info_index = system_info_table[n][2].size() > 0 ? 
2 : 1; + if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { + std::string::size_type pos = 0; + std::string::size_type endpos = 0; + std::string sub_str; + + int core_1; + int core_2; + + if (0 == _sockets) { + _proc_type_table.push_back(line_value_0); + } else { + _proc_type_table.push_back(_proc_type_table[0]); + _proc_type_table[0] = line_value_0; + } - for (int m = core_1; m <= core_2; m++) { - _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets; - _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID]; - update_proc_map_info(m); + while (1) { + if ((endpos = system_info_table[n][info_index].find('-', pos)) != std::string::npos) { + sub_str = system_info_table[n][info_index].substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = system_info_table[n][info_index].substr(endpos + 1); + core_2 = std::stoi(sub_str); + + if ((info_index == 1) && (core_2 - core_1 == 1)) { + offline_list.push_back(n); + break; + } + for (int m = core_1; m <= core_2; m++) { + _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets; + _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID]; + update_proc_map_info(m); + if (_processors == 0) { + return; + }; + } + } else if (pos != std::string::npos) { + sub_str = system_info_table[n][info_index].substr(pos); + core_1 = std::stoi(sub_str); + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = _sockets; + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + update_proc_map_info(core_1); if (_processors == 0) { return; }; + endpos = pos; } - } else if (pos != std::string::npos) { - sub_str = system_info_table[n][2].substr(pos); - core_1 = std::stoi(sub_str); - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = _sockets; - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - update_proc_map_info(core_1); - if (_processors == 0) { - return; - }; - endpos = pos; - } - if ((pos = system_info_table[n][2].find(',', endpos)) != std::string::npos) { - pos++; - } else { - break; + if ((pos = system_info_table[n][2].find(',', endpos)) != std::string::npos) { + pos++; + } else { + break; + } + } + _sockets++; + if (_proc_type_table[0][ALL_PROC] == 0) { + _proc_type_table.erase(_proc_type_table.begin()); + _sockets--; } } - _sockets++; + } else { + offline_list.push_back(n); } } @@ -540,6 +591,11 @@ void parse_cache_info_linux(const std::vector> system_i _numa_nodes = node_info_table.size(); parse_node_info_linux(node_info_table, _numa_nodes, _sockets, _proc_type_table, _cpu_mapping_table); } + + for (size_t n = 0; n < offline_list.size(); n++) { + _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n); + _processors--; + } }; void get_cpu_mapping_from_cores(const int _processors, @@ -615,7 +671,6 @@ void parse_freq_info_linux(const std::vector> system_in std::vector>& _cpu_mapping_table) { int freq_max = 0; bool ecore_enabled = false; - bool ht_enabled = false; _processors = system_info_table.size(); _numa_nodes = 0; @@ -625,6 +680,8 @@ void parse_freq_info_linux(const std::vector> system_in std::vector line_value_0(PROC_TYPE_TABLE_SIZE, 0); + std::vector offline_list; + auto clean_up_output = [&]() { _processors = 0; _cores = 0; @@ -636,65 +693,68 @@ void parse_freq_info_linux(const std::vector> system_in }; for (int n = 0; n < _processors; n++) { - if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { - std::string::size_type pos = 0; - std::string::size_type endpos1 = 0; - 
std::string::size_type endpos2 = 0; - std::string sub_str; - - int core_1 = 0; - int core_2 = 0; - - if (((endpos1 = system_info_table[n][0].find(',', pos)) != std::string::npos) || - ((endpos2 = system_info_table[n][0].find('-', pos)) != std::string::npos)) { - endpos1 = (endpos1 != std::string::npos) ? endpos1 : endpos2; - sub_str = system_info_table[n][0].substr(pos, endpos1 - pos); - core_1 = std::stoi(sub_str); - sub_str = system_info_table[n][0].substr(endpos1 + 1); - core_2 = std::stoi(sub_str); - if ((core_1 != n) && (core_2 != n)) { - clean_up_output(); - return; - } - - _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = HYPER_THREADING_PROC; - _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + if (system_info_table[n][2].size() > 0) { + if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { + std::string::size_type pos = 0; + std::string::size_type endpos1 = 0; + std::string::size_type endpos2 = 0; + std::string sub_str; + + int core_1 = 0; + int core_2 = 0; + + if (((endpos1 = system_info_table[n][0].find(',', pos)) != std::string::npos) || + ((endpos2 = system_info_table[n][0].find('-', pos)) != std::string::npos)) { + endpos1 = (endpos1 != std::string::npos) ? endpos1 : endpos2; + sub_str = system_info_table[n][0].substr(pos, endpos1 - pos); + core_1 = std::stoi(sub_str); + sub_str = system_info_table[n][0].substr(endpos1 + 1); + core_2 = std::stoi(sub_str); + if ((core_1 != n) && (core_2 != n)) { + clean_up_output(); + return; + } - _cpu_mapping_table[core_2][CPU_MAP_PROCESSOR_ID] = core_2; - _cpu_mapping_table[core_2][CPU_MAP_SOCKET_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_2][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_2][CPU_MAP_CORE_ID] = _cpu_mapping_table[core_1][CPU_MAP_CORE_ID]; - _cpu_mapping_table[core_2][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - _cpu_mapping_table[core_2][CPU_MAP_GROUP_ID] = _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID]; + _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = HYPER_THREADING_PROC; + _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + + _cpu_mapping_table[core_2][CPU_MAP_PROCESSOR_ID] = core_2; + _cpu_mapping_table[core_2][CPU_MAP_SOCKET_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_2][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_2][CPU_MAP_CORE_ID] = _cpu_mapping_table[core_1][CPU_MAP_CORE_ID]; + _cpu_mapping_table[core_2][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + _cpu_mapping_table[core_2][CPU_MAP_GROUP_ID] = _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID]; + + int core_freq = std::stoi(system_info_table[core_1][2]); + freq_max = std::max(core_freq, freq_max); + } else if (system_info_table[n][0].size() > 0) { + core_1 = std::stoi(system_info_table[n][0]); - ht_enabled = true; - int core_freq = std::stoi(system_info_table[core_1][2]); - freq_max = 
std::max(core_freq, freq_max); - } else if (system_info_table[n][0].size() > 0) { - core_1 = std::stoi(system_info_table[n][0]); + _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; - _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; + int core_freq = std::stoi(system_info_table[core_1][2]); + if ((0 == freq_max) || (core_freq >= freq_max * 0.97)) { + freq_max = std::max(core_freq, freq_max); + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + } else { + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = EFFICIENT_CORE_PROC; + ecore_enabled = true; + } - int core_freq = std::stoi(system_info_table[core_1][2]); - if (((0 == freq_max) || (core_freq >= freq_max * 0.95)) && (!ht_enabled)) { - freq_max = std::max(core_freq, freq_max); - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - } else { - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = EFFICIENT_CORE_PROC; - ecore_enabled = true; + _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; } - - _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + _sockets = std::max(_sockets, _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]); + _cores++; } - _sockets = std::max(_sockets, _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]); - _cores++; + } else { + offline_list.push_back(n); } } @@ -733,6 +793,11 @@ void parse_freq_info_linux(const std::vector> system_in _numa_nodes = node_info_table.size(); parse_node_info_linux(node_info_table, _numa_nodes, _sockets, _proc_type_table, _cpu_mapping_table); } + + for (size_t n = 0; n < offline_list.size(); n++) { + _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n); + _processors--; + } }; void update_valid_processor_linux(const std::vector phy_core_list, diff --git a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp index 8679090b9ae491..9ea43bd0604296 100644 --- a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp +++ b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp @@ -385,6 +385,188 @@ LinuxCpuMapTestCase cache_1sockets_96cores = { {"0-95"}, }, }; +LinuxCpuMapTestCase cache_2sockets_56cores_hyperthreading = { + 110, + 2, + 2, + 56, + {{110, 56, 0, 54, -1, -1}, {54, 28, 0, 26, 0, 0}, {56, 28, 0, 28, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {11, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {12, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {13, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {14, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {15, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {16, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {17, 0, 0, 16, 
HYPER_THREADING_PROC, 16, -1}, {18, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {19, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {21, 0, 0, 19, HYPER_THREADING_PROC, 19, -1}, + {22, 0, 0, 20, HYPER_THREADING_PROC, 20, -1}, {23, 0, 0, 21, HYPER_THREADING_PROC, 21, -1}, + {24, 0, 0, 22, HYPER_THREADING_PROC, 22, -1}, {25, 0, 0, 23, HYPER_THREADING_PROC, 23, -1}, + {26, 0, 0, 24, HYPER_THREADING_PROC, 24, -1}, {27, 0, 0, 25, HYPER_THREADING_PROC, 25, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 1, 1, 36, HYPER_THREADING_PROC, 36, -1}, {37, 1, 1, 37, HYPER_THREADING_PROC, 37, -1}, + {38, 1, 1, 38, HYPER_THREADING_PROC, 38, -1}, {39, 1, 1, 39, HYPER_THREADING_PROC, 39, -1}, + {40, 1, 1, 40, HYPER_THREADING_PROC, 40, -1}, {41, 1, 1, 41, HYPER_THREADING_PROC, 41, -1}, + {42, 1, 1, 42, HYPER_THREADING_PROC, 42, -1}, {43, 1, 1, 43, HYPER_THREADING_PROC, 43, -1}, + {44, 1, 1, 44, HYPER_THREADING_PROC, 44, -1}, {45, 1, 1, 45, HYPER_THREADING_PROC, 45, -1}, + {46, 1, 1, 46, HYPER_THREADING_PROC, 46, -1}, {47, 1, 1, 47, HYPER_THREADING_PROC, 47, -1}, + {48, 1, 1, 48, HYPER_THREADING_PROC, 48, -1}, {49, 1, 1, 49, HYPER_THREADING_PROC, 49, -1}, + {50, 1, 1, 50, HYPER_THREADING_PROC, 50, -1}, {51, 1, 1, 51, HYPER_THREADING_PROC, 51, -1}, + {52, 1, 1, 52, HYPER_THREADING_PROC, 52, -1}, {53, 1, 1, 53, HYPER_THREADING_PROC, 53, -1}, + {54, 1, 1, 54, HYPER_THREADING_PROC, 54, -1}, {55, 1, 1, 55, HYPER_THREADING_PROC, 55, -1}, + {56, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {57, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {58, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {59, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {60, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {61, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {62, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {63, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {64, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {65, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {66, 0, 0, 26, MAIN_CORE_PROC, 26, -1}, {67, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, + {68, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, {69, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, + {70, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, {71, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, + {72, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, {73, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, + {74, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, {75, 0, 0, 18, MAIN_CORE_PROC, 18, -1}, + {76, 0, 0, 27, MAIN_CORE_PROC, 27, -1}, {77, 0, 0, 19, MAIN_CORE_PROC, 19, -1}, + {78, 0, 0, 20, MAIN_CORE_PROC, 20, -1}, {79, 0, 0, 21, MAIN_CORE_PROC, 21, -1}, + {80, 0, 0, 22, MAIN_CORE_PROC, 22, -1}, {81, 0, 0, 23, MAIN_CORE_PROC, 23, -1}, + {82, 0, 0, 24, MAIN_CORE_PROC, 24, -1}, {83, 0, 0, 25, MAIN_CORE_PROC, 25, -1}, + {84, 1, 1, 28, MAIN_CORE_PROC, 28, -1}, {85, 1, 1, 29, MAIN_CORE_PROC, 29, -1}, + {86, 1, 1, 30, MAIN_CORE_PROC, 30, -1}, {87, 1, 1, 31, MAIN_CORE_PROC, 31, -1}, + {88, 1, 1, 32, MAIN_CORE_PROC, 32, -1}, {89, 1, 1, 33, MAIN_CORE_PROC, 33, -1}, + {90, 1, 1, 34, MAIN_CORE_PROC, 34, -1}, {91, 1, 1, 35, MAIN_CORE_PROC, 35, -1}, + {92, 1, 1, 36, MAIN_CORE_PROC, 36, -1}, {93, 1, 1, 37, MAIN_CORE_PROC, 37, -1}, + {94, 1, 1, 38, MAIN_CORE_PROC, 38, -1}, {95, 1, 1, 39, MAIN_CORE_PROC, 39, -1}, + {96, 1, 1, 40, MAIN_CORE_PROC, 40, -1}, {97, 1, 1, 41, MAIN_CORE_PROC, 41, -1}, + {98, 1, 1, 42, MAIN_CORE_PROC, 42, -1}, {99, 1, 1, 43, MAIN_CORE_PROC, 43, -1}, + 
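// Editor's note: each expected _cpu_mapping_table row in these test cases follows the
// CPU_MAP_* column order used by the parser above, i.e. {PROCESSOR_ID, SOCKET_ID,
// NUMA_NODE_ID, CORE_ID, CORE_TYPE, GROUP_ID, used flag}, where -1 appears to mean
// "not assigned yet". Hypothetical decode of one row from this table, for orientation:
enum SketchCoreType { HT_SKETCH, MAIN_SKETCH, EFF_SKETCH };  // stand-ins for the *_PROC constants
struct SketchRow { int proc, socket, numa, core; SketchCoreType type; int group, used; };
constexpr SketchRow cpu66 = {66, 0, 0, 26, MAIN_SKETCH, 26, -1};
// logical CPU 66 = physical core 26 on socket 0, a performance core, group 26, unused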
{100, 1, 1, 44, MAIN_CORE_PROC, 44, -1}, {101, 1, 1, 45, MAIN_CORE_PROC, 45, -1}, + {102, 1, 1, 46, MAIN_CORE_PROC, 46, -1}, {103, 1, 1, 47, MAIN_CORE_PROC, 47, -1}, + {104, 1, 1, 48, MAIN_CORE_PROC, 48, -1}, {105, 1, 1, 49, MAIN_CORE_PROC, 49, -1}, + {106, 1, 1, 50, MAIN_CORE_PROC, 50, -1}, {107, 1, 1, 51, MAIN_CORE_PROC, 51, -1}, + {108, 1, 1, 52, MAIN_CORE_PROC, 52, -1}, {109, 1, 1, 53, MAIN_CORE_PROC, 53, -1}, + {110, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {111, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + }, + { + {"0,56", "0,56", "0-9,11-19,21-27,56-83"}, + {"1,57", "1,57", "0-9,11-19,21-27,56-83"}, + {"2,58", "2,58", "0-9,11-19,21-27,56-83"}, + {"3,59", "3,59", "0-9,11-19,21-27,56-83"}, + {"4,60", "4,60", "0-9,11-19,21-27,56-83"}, + {"5,61", "5,61", "0-9,11-19,21-27,56-83"}, + {"6,62", "6,62", "0-9,11-19,21-27,56-83"}, + {"7,63", "7,63", "0-9,11-19,21-27,56-83"}, + {"8,64", "8,64", "0-9,11-19,21-27,56-83"}, + {"9,65", "9,65", "0-9,11-19,21-27,56-83"}, + {"", "", ""}, + {"11,67", "11,67", "0-9,11-19,21-27,56-83"}, + {"12,68", "12,68", "0-9,11-19,21-27,56-83"}, + {"13,69", "13,69", "0-9,11-19,21-27,56-83"}, + {"14,70", "14,70", "0-9,11-19,21-27,56-83"}, + {"15,71", "15,71", "0-9,11-19,21-27,56-83"}, + {"16,72", "16,72", "0-9,11-19,21-27,56-83"}, + {"17,73", "17,73", "0-9,11-19,21-27,56-83"}, + {"18,74", "18,74", "0-9,11-19,21-27,56-83"}, + {"19,75", "19,75", "0-9,11-19,21-27,56-83"}, + {"", "", ""}, + {"21,77", "21,77", "0-9,11-19,21-27,56-83"}, + {"22,78", "22,78", "0-9,11-19,21-27,56-83"}, + {"23,79", "23,79", "0-9,11-19,21-27,56-83"}, + {"24,80", "24,80", "0-9,11-19,21-27,56-83"}, + {"25,81", "25,81", "0-9,11-19,21-27,56-83"}, + {"26,82", "26,82", "0-9,11-19,21-27,56-83"}, + {"27,83", "27,83", "0-9,11-19,21-27,56-83"}, + {"28,84", "28,84", "28-55,84-111"}, + {"29,85", "29,85", "28-55,84-111"}, + {"30,86", "30,86", "28-55,84-111"}, + {"31,87", "31,87", "28-55,84-111"}, + {"32,88", "32,88", "28-55,84-111"}, + {"33,89", "33,89", "28-55,84-111"}, + {"34,90", "34,90", "28-55,84-111"}, + {"35,91", "35,91", "28-55,84-111"}, + {"36,92", "36,92", "28-55,84-111"}, + {"37,93", "37,93", "28-55,84-111"}, + {"38,94", "38,94", "28-55,84-111"}, + {"39,95", "39,95", "28-55,84-111"}, + {"40,96", "40,96", "28-55,84-111"}, + {"41,97", "41,97", "28-55,84-111"}, + {"42,98", "42,98", "28-55,84-111"}, + {"43,99", "43,99", "28-55,84-111"}, + {"44,100", "44,100", "28-55,84-111"}, + {"45,101", "45,101", "28-55,84-111"}, + {"46,102", "46,102", "28-55,84-111"}, + {"47,103", "47,103", "28-55,84-111"}, + {"48,104", "48,104", "28-55,84-111"}, + {"49,105", "49,105", "28-55,84-111"}, + {"50,106", "50,106", "28-55,84-111"}, + {"51,107", "51,107", "28-55,84-111"}, + {"52,108", "52,108", "28-55,84-111"}, + {"53,109", "53,109", "28-55,84-111"}, + {"54,110", "54,110", "28-55,84-111"}, + {"55,111", "55,111", "28-55,84-111"}, + {"0,56", "0,56", "0-9,11-19,21-27,56-83"}, + {"1,57", "1,57", "0-9,11-19,21-27,56-83"}, + {"2,58", "2,58", "0-9,11-19,21-27,56-83"}, + {"3,59", "3,59", "0-9,11-19,21-27,56-83"}, + {"4,60", "4,60", "0-9,11-19,21-27,56-83"}, + {"5,61", "5,61", "0-9,11-19,21-27,56-83"}, + {"6,62", "6,62", "0-9,11-19,21-27,56-83"}, + {"7,63", "7,63", "0-9,11-19,21-27,56-83"}, + {"8,64", "8,64", "0-9,11-19,21-27,56-83"}, + {"9,65", "9,65", "0-9,11-19,21-27,56-83"}, + {"66", "66", "0-9,11-19,21-27,56-83"}, + {"11,67", "11,67", "0-9,11-19,21-27,56-83"}, + {"12,68", "12,68", "0-9,11-19,21-27,56-83"}, + {"13,69", "13,69", "0-9,11-19,21-27,56-83"}, + {"14,70", "14,70", "0-9,11-19,21-27,56-83"}, + {"15,71", "15,71", 
"0-9,11-19,21-27,56-83"}, + {"16,72", "16,72", "0-9,11-19,21-27,56-83"}, + {"17,73", "17,73", "0-9,11-19,21-27,56-83"}, + {"18,74", "18,74", "0-9,11-19,21-27,56-83"}, + {"19,75", "19,75", "0-9,11-19,21-27,56-83"}, + {"76", "76", "0-9,11-19,21-27,56-83"}, + {"21,77", "21,77", "0-9,11-19,21-27,56-83"}, + {"22,78", "22,78", "0-9,11-19,21-27,56-83"}, + {"23,79", "23,79", "0-9,11-19,21-27,56-83"}, + {"24,80", "24,80", "0-9,11-19,21-27,56-83"}, + {"25,81", "25,81", "0-9,11-19,21-27,56-83"}, + {"26,82", "26,82", "0-9,11-19,21-27,56-83"}, + {"27,83", "27,83", "0-9,11-19,21-27,56-83"}, + {"28,84", "28,84", "28-55,84-111"}, + {"29,85", "29,85", "28-55,84-111"}, + {"30,86", "30,86", "28-55,84-111"}, + {"31,87", "31,87", "28-55,84-111"}, + {"32,88", "32,88", "28-55,84-111"}, + {"33,89", "33,89", "28-55,84-111"}, + {"34,90", "34,90", "28-55,84-111"}, + {"35,91", "35,91", "28-55,84-111"}, + {"36,92", "36,92", "28-55,84-111"}, + {"37,93", "37,93", "28-55,84-111"}, + {"38,94", "38,94", "28-55,84-111"}, + {"39,95", "39,95", "28-55,84-111"}, + {"40,96", "40,96", "28-55,84-111"}, + {"41,97", "41,97", "28-55,84-111"}, + {"42,98", "42,98", "28-55,84-111"}, + {"43,99", "43,99", "28-55,84-111"}, + {"44,100", "44,100", "28-55,84-111"}, + {"45,101", "45,101", "28-55,84-111"}, + {"46,102", "46,102", "28-55,84-111"}, + {"47,103", "47,103", "28-55,84-111"}, + {"48,104", "48,104", "28-55,84-111"}, + {"49,105", "49,105", "28-55,84-111"}, + {"50,106", "50,106", "28-55,84-111"}, + {"51,107", "51,107", "28-55,84-111"}, + {"52,108", "52,108", "28-55,84-111"}, + {"53,109", "53,109", "28-55,84-111"}, + {"54,110", "54,110", "28-55,84-111"}, + {"55,111", "55,111", "28-55,84-111"}, + }, + { + {"0-9,11-19,21-27,56-83"}, + {"28-55,84-111"}, + }, +}; LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading = { 96, 2, @@ -1005,6 +1187,36 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading_1 = { }, {}, }; +LinuxCpuMapTestCase cache_1sockets_16cores_hyperthreading = { + 20, + 1, + 1, + 14, + {{20, 6, 8, 6, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1}, {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1}, + {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1}, {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1}, + {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1}, {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1}, + {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1}, {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1}, + }, + { + {"0,5", "0,5", "0-19"}, {"1-2", "1-2", "0-19"}, {"1-2", "1-2", "0-19"}, {"3-4", "3-4", "0-19"}, + {"3-4", "3-4", "0-19"}, {"0,5", "0,5", "0-19"}, {"6-7", "6-7", "0-19"}, {"6-7", "6-7", "0-19"}, + {"8-9", "8-9", "0-19"}, {"8-9", "8-9", "0-19"}, {"10-11", "10-11", "0-19"}, {"10-11", "10-11", "0-19"}, + {"12", "12-15", "0-19"}, {"13", "12-15", "0-19"}, {"14", "12-15", "0-19"}, {"15", "12-15", "0-19"}, + {"16", "16-19", "0-19"}, {"17", "16-19", "0-19"}, {"18", "16-19", "0-19"}, {"19", "16-19", "0-19"}, + {"20", "20-21", ""}, {"21", "20-21", ""}, + }, + { + {"0-21"}, + }, +}; LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading = { 20, 1, @@ -1135,6 +1347,36 @@ 
LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading = { }, {{"0-11"}}, }; +LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading_1 = { + 8, + 1, + 1, + 8, + {{8, 4, 4, 0, 0, 0}}, + { + {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {2, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {3, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {4, 0, 0, 4, EFFICIENT_CORE_PROC, 4, -1}, + {5, 0, 0, 5, EFFICIENT_CORE_PROC, 4, -1}, + {6, 0, 0, 6, EFFICIENT_CORE_PROC, 4, -1}, + {7, 0, 0, 7, EFFICIENT_CORE_PROC, 4, -1}, + }, + { + {"0", "0", "0-3"}, + {"1", "1", "0-3"}, + {"2", "2", "0-3"}, + {"3", "3", "0-3"}, + {"4", "4-7", ""}, + {"5", "4-7", ""}, + {"6", "4-7", ""}, + {"7", "4-7", ""}, + }, + { + {"0-7"}, + }, +}; LinuxCpuMapTestCase cache_1sockets_6cores_hyperthreading = { 12, 1, @@ -1220,6 +1462,7 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, LinuxCpuMapCacheParserTests, testing::Values(cache_2sockets_104cores_hyperthreading, cache_1sockets_96cores, + cache_2sockets_56cores_hyperthreading, cache_2sockets_48cores_hyperthreading, cache_2sockets_48cores_hyperthreading_1, cache_2sockets_24cores_hyperthreading, @@ -1229,10 +1472,12 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, cache_2sockets_48cores_2, cache_2sockets_20cores_hyperthreading, cache_2sockets_20cores_hyperthreading_1, + cache_1sockets_16cores_hyperthreading, cache_1sockets_14cores_hyperthreading, cache_1sockets_14cores_hyperthreading_1, cache_1sockets_10cores_hyperthreading, cache_1sockets_8cores_hyperthreading, + cache_1sockets_8cores_hyperthreading_1, cache_1sockets_6cores_hyperthreading, cache_1sockets_4cores, cache_VM_cache_0)); diff --git a/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp index 04ab617961b953..8ccdfad011d19c 100644 --- a/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp +++ b/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp @@ -258,6 +258,188 @@ LinuxCpuMapTestCase freq_2sockets_112cores_hyperthreading = { }, // param[in]: The CPU frequency information table of this simulated platform {{"0-55,112-167"}, {"56-111,168-223"}}, // param[in]: The numa node information table of this simulated platform }; +LinuxCpuMapTestCase freq_2sockets_56cores_hyperthreading = { + 110, + 2, + 2, + 56, + {{110, 56, 0, 54, -1, -1}, {54, 28, 0, 26, 0, 0}, {56, 28, 0, 28, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {11, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {12, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {13, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {14, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {15, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {16, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {17, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {18, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {19, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {21, 0, 0, 19, HYPER_THREADING_PROC, 19, -1}, + {22, 0, 0, 20, HYPER_THREADING_PROC, 20, -1}, {23, 0, 0, 21, HYPER_THREADING_PROC, 21, -1}, + {24, 0, 0, 22, HYPER_THREADING_PROC, 22, -1}, {25, 0, 0, 23, HYPER_THREADING_PROC, 23, -1}, + {26, 0, 0, 24, HYPER_THREADING_PROC, 24, -1}, {27, 0, 0, 25, HYPER_THREADING_PROC, 25, 
-1}, + {28, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {29, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {30, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {31, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {32, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {33, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {34, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {35, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {36, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {37, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {38, 1, 1, 36, HYPER_THREADING_PROC, 36, -1}, {39, 1, 1, 37, HYPER_THREADING_PROC, 37, -1}, + {40, 1, 1, 38, HYPER_THREADING_PROC, 38, -1}, {41, 1, 1, 39, HYPER_THREADING_PROC, 39, -1}, + {42, 1, 1, 40, HYPER_THREADING_PROC, 40, -1}, {43, 1, 1, 41, HYPER_THREADING_PROC, 41, -1}, + {44, 1, 1, 42, HYPER_THREADING_PROC, 42, -1}, {45, 1, 1, 43, HYPER_THREADING_PROC, 43, -1}, + {46, 1, 1, 44, HYPER_THREADING_PROC, 44, -1}, {47, 1, 1, 45, HYPER_THREADING_PROC, 45, -1}, + {48, 1, 1, 46, HYPER_THREADING_PROC, 46, -1}, {49, 1, 1, 47, HYPER_THREADING_PROC, 47, -1}, + {50, 1, 1, 48, HYPER_THREADING_PROC, 48, -1}, {51, 1, 1, 49, HYPER_THREADING_PROC, 49, -1}, + {52, 1, 1, 50, HYPER_THREADING_PROC, 50, -1}, {53, 1, 1, 51, HYPER_THREADING_PROC, 51, -1}, + {54, 1, 1, 52, HYPER_THREADING_PROC, 52, -1}, {55, 1, 1, 53, HYPER_THREADING_PROC, 53, -1}, + {56, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {57, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {58, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {59, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {60, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {61, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {62, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {63, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {64, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {65, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {66, 0, 0, 54, MAIN_CORE_PROC, 54, -1}, {67, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, + {68, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, {69, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, + {70, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, {71, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, + {72, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, {73, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, + {74, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, {75, 0, 0, 18, MAIN_CORE_PROC, 18, -1}, + {76, 0, 0, 55, MAIN_CORE_PROC, 55, -1}, {77, 0, 0, 19, MAIN_CORE_PROC, 19, -1}, + {78, 0, 0, 20, MAIN_CORE_PROC, 20, -1}, {79, 0, 0, 21, MAIN_CORE_PROC, 21, -1}, + {80, 0, 0, 22, MAIN_CORE_PROC, 22, -1}, {81, 0, 0, 23, MAIN_CORE_PROC, 23, -1}, + {82, 0, 0, 24, MAIN_CORE_PROC, 24, -1}, {83, 0, 0, 25, MAIN_CORE_PROC, 25, -1}, + {84, 1, 1, 26, MAIN_CORE_PROC, 26, -1}, {85, 1, 1, 27, MAIN_CORE_PROC, 27, -1}, + {86, 1, 1, 28, MAIN_CORE_PROC, 28, -1}, {87, 1, 1, 29, MAIN_CORE_PROC, 29, -1}, + {88, 1, 1, 30, MAIN_CORE_PROC, 30, -1}, {89, 1, 1, 31, MAIN_CORE_PROC, 31, -1}, + {90, 1, 1, 32, MAIN_CORE_PROC, 32, -1}, {91, 1, 1, 33, MAIN_CORE_PROC, 33, -1}, + {92, 1, 1, 34, MAIN_CORE_PROC, 34, -1}, {93, 1, 1, 35, MAIN_CORE_PROC, 35, -1}, + {94, 1, 1, 36, MAIN_CORE_PROC, 36, -1}, {95, 1, 1, 37, MAIN_CORE_PROC, 37, -1}, + {96, 1, 1, 38, MAIN_CORE_PROC, 38, -1}, {97, 1, 1, 39, MAIN_CORE_PROC, 39, -1}, + {98, 1, 1, 40, MAIN_CORE_PROC, 40, -1}, {99, 1, 1, 41, MAIN_CORE_PROC, 41, -1}, + {100, 1, 1, 42, MAIN_CORE_PROC, 42, -1}, {101, 1, 1, 43, MAIN_CORE_PROC, 43, -1}, + {102, 1, 1, 44, MAIN_CORE_PROC, 44, -1}, {103, 1, 1, 45, MAIN_CORE_PROC, 45, -1}, + {104, 1, 1, 46, MAIN_CORE_PROC, 46, -1}, {105, 1, 1, 47, MAIN_CORE_PROC, 47, -1}, + {106, 1, 1, 48, MAIN_CORE_PROC, 48, -1}, {107, 1, 1, 49, MAIN_CORE_PROC, 49, -1}, + {108, 1, 1, 50, MAIN_CORE_PROC, 50, -1}, {109, 1, 1, 51, MAIN_CORE_PROC, 51, -1}, + {110, 1, 1, 52, 
MAIN_CORE_PROC, 52, -1}, {111, 1, 1, 53, MAIN_CORE_PROC, 53, -1}, + }, + { + {"0,56", "0", "3500000"}, + {"1,57", "0", "3500000"}, + {"2,58", "0", "3500000"}, + {"3,59", "0", "3500000"}, + {"4,60", "0", "3500000"}, + {"5,61", "0", "3500000"}, + {"6,62", "0", "3500000"}, + {"7,63", "0", "3500000"}, + {"8,64", "0", "3500000"}, + {"9,65", "0", "3500000"}, + {"", "", ""}, + {"11,67", "0", "3500000"}, + {"12,68", "0", "3500000"}, + {"13,69", "0", "3500000"}, + {"14,70", "0", "3500000"}, + {"15,71", "0", "3500000"}, + {"16,72", "0", "3500000"}, + {"17,73", "0", "3500000"}, + {"18,74", "0", "3500000"}, + {"19,75", "0", "3500000"}, + {"", "", ""}, + {"21,77", "0", "3500000"}, + {"22,78", "0", "3500000"}, + {"23,79", "0", "3500000"}, + {"24,80", "0", "3500000"}, + {"25,81", "0", "3500000"}, + {"26,82", "0", "3500000"}, + {"27,83", "0", "3500000"}, + {"28,84", "1", "3500000"}, + {"29,85", "1", "3500000"}, + {"30,86", "1", "3500000"}, + {"31,87", "1", "3500000"}, + {"32,88", "1", "3500000"}, + {"33,89", "1", "3500000"}, + {"34,90", "1", "3500000"}, + {"35,91", "1", "3500000"}, + {"36,92", "1", "3500000"}, + {"37,93", "1", "3500000"}, + {"38,94", "1", "3500000"}, + {"39,95", "1", "3500000"}, + {"40,96", "1", "3500000"}, + {"41,97", "1", "3500000"}, + {"42,98", "1", "3500000"}, + {"43,99", "1", "3500000"}, + {"44,100", "1", "3500000"}, + {"45,101", "1", "3500000"}, + {"46,102", "1", "3500000"}, + {"47,103", "1", "3500000"}, + {"48,104", "1", "3500000"}, + {"49,105", "1", "3500000"}, + {"50,106", "1", "3500000"}, + {"51,107", "1", "3500000"}, + {"52,108", "1", "3500000"}, + {"53,109", "1", "3500000"}, + {"54,110", "1", "3500000"}, + {"55,111", "1", "3500000"}, + {"0,56", "0", "3500000"}, + {"1,57", "0", "3500000"}, + {"2,58", "0", "3500000"}, + {"3,59", "0", "3500000"}, + {"4,60", "0", "3500000"}, + {"5,61", "0", "3500000"}, + {"6,62", "0", "3500000"}, + {"7,63", "0", "3500000"}, + {"8,64", "0", "3500000"}, + {"9,65", "0", "3500000"}, + {"66", "0", "3500000"}, + {"11,67", "0", "3500000"}, + {"12,68", "0", "3500000"}, + {"13,69", "0", "3500000"}, + {"14,70", "0", "3500000"}, + {"15,71", "0", "3500000"}, + {"16,72", "0", "3500000"}, + {"17,73", "0", "3500000"}, + {"18,74", "0", "3500000"}, + {"19,75", "0", "3500000"}, + {"76", "0", "3500000"}, + {"21,77", "0", "3500000"}, + {"22,78", "0", "3500000"}, + {"23,79", "0", "3500000"}, + {"24,80", "0", "3500000"}, + {"25,81", "0", "3500000"}, + {"26,82", "0", "3500000"}, + {"27,83", "0", "3500000"}, + {"28,84", "1", "3500000"}, + {"29,85", "1", "3500000"}, + {"30,86", "1", "3500000"}, + {"31,87", "1", "3500000"}, + {"32,88", "1", "3500000"}, + {"33,89", "1", "3500000"}, + {"34,90", "1", "3500000"}, + {"35,91", "1", "3500000"}, + {"36,92", "1", "3500000"}, + {"37,93", "1", "3500000"}, + {"38,94", "1", "3500000"}, + {"39,95", "1", "3500000"}, + {"40,96", "1", "3500000"}, + {"41,97", "1", "3500000"}, + {"42,98", "1", "3500000"}, + {"43,99", "1", "3500000"}, + {"44,100", "1", "3500000"}, + {"45,101", "1", "3500000"}, + {"46,102", "1", "3500000"}, + {"47,103", "1", "3500000"}, + {"48,104", "1", "3500000"}, + {"49,105", "1", "3500000"}, + {"50,106", "1", "3500000"}, + {"51,107", "1", "3500000"}, + {"52,108", "1", "3500000"}, + {"53,109", "1", "3500000"}, + {"54,110", "1", "3500000"}, + {"55,111", "1", "3500000"}, + }, + { + {"0-9,11-19,21-27,56-83"}, + {"28-55,84-111"}, + }, +}; LinuxCpuMapTestCase freq_2sockets_48cores_hyperthreading = { 96, 2, @@ -987,6 +1169,7 @@ TEST_P(LinuxCpuMapFreqParserTests, LinuxFreq) {} INSTANTIATE_TEST_SUITE_P(CPUMap, 
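// Editor's note: each row of the frequency tables above appears to be
// {thread_siblings, physical_package_id, max_frequency_kHz}; the two empty rows model
// offline CPUs 10 and 20, which is why this case expects 110 processors out of 112 and
// routes those two ids into offline_list. Quick scalar check of the classification
// rule used by the parser (values are illustrative):
int freq_max = 0;
const int core_freq = std::stoi("3500000");                // 3.5 GHz in kHz
const bool is_main = (0 == freq_max) || (core_freq >= freq_max * 0.97);
// the first core seen is always classified MAIN_CORE_PROC, since freq_max starts at 0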
LinuxCpuMapFreqParserTests, testing::Values(freq_2sockets_112cores_hyperthreading, + freq_2sockets_56cores_hyperthreading, freq_2sockets_48cores_hyperthreading, freq_2sockets_48cores_hyperthreading_1, freq_2sockets_24cores_hyperthreading, diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 67c538bd78341a..865ec1f692b762 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -144,6 +144,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"Loop", Type::TensorIterator}, {"ReadValue", Type::MemoryInput}, // for construction from name ctor, arbitrary name is used {"Assign", Type::MemoryOutput}, // for construction from layer ctor + {"ReadValueWithSubgraph", Type::MemoryInput}, {"Convert", Type::Convert}, {"NV12toRGB", Type::ColorConvert}, {"NV12toBGR", Type::ColorConvert}, diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 457f8368f734dd..1c5598b6d55e26 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -36,6 +36,8 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { case dnnl::memory::data_type::s4: case dnnl::memory::data_type::u4: case dnnl::memory::data_type::f8_e8m0: + case dnnl::memory::data_type::f8_e4m3: + case dnnl::memory::data_type::f8_e5m2: case dnnl::memory::data_type::f4_e2m1: return 1; case dnnl::memory::data_type::undef: @@ -70,6 +72,10 @@ dnnl::memory::data_type DnnlExtensionUtils::ElementTypeToDataType(const ov::elem return memory::data_type::u4; case ov::element::f8e8m0: return memory::data_type::f8_e8m0; + case ov::element::f8e4m3: + return memory::data_type::f8_e4m3; + case ov::element::f8e5m2: + return memory::data_type::f8_e5m2; case ov::element::f4e2m1: return memory::data_type::f4_e2m1; case ov::element::undefined: @@ -106,6 +112,10 @@ ov::element::Type DnnlExtensionUtils::DataTypeToElementType(const dnnl::memory:: return ov::element::u4; case memory::data_type::f8_e8m0: return ov::element::f8e8m0; + case memory::data_type::f8_e4m3: + return ov::element::f8e4m3; + case memory::data_type::f8_e5m2: + return ov::element::f8e5m2; case memory::data_type::f4_e2m1: return ov::element::f4e2m1; case memory::data_type::undef: diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp index 2bfbaa68880aa8..6ad7d758b9ff07 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp @@ -11,13 +11,14 @@ namespace intel_cpu { class jit_uni_vcvtneps2bf16 : public jit_emitter { public: + enum class conversion_mode { default_mode, saturation_mode }; jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::bf16) + ov::element::Type exec_prc = ov::element::bf16, + conversion_mode mode = conversion_mode::default_mode) : jit_emitter(host, host_isa, exec_prc) { - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) && - !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) - prepare_table(); + prepare_table(); + mode_ = mode; } size_t get_inputs_num() const override { @@ -25,6 +26,7 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { } private: + conversion_mode mode_ = conversion_mode::default_mode; void 
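// Editor's sketch (illustrative, not part of the patch): with the new constructor
// parameter a call site can opt into the saturating conversion, mirroring the eltwise
// usage later in this patch; host and host_isa are placeholders here.
auto cvt = std::make_shared<jit_uni_vcvtneps2bf16>(
    host,
    host_isa,
    ov::element::bf16,
    jit_uni_vcvtneps2bf16::conversion_mode::saturation_mode);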
emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override { if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); @@ -44,6 +46,25 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { conditional3::type; Vmm in = Vmm(in_vec_idxs[0]); + if (mode_ == conversion_mode::saturation_mode) { + Vmm vmm_temp = Vmm(out_vec_idxs[0]); + + h->uni_vmaxps(vmm_temp, in, table_val("bf16_min")); + h->uni_vminps(vmm_temp, vmm_temp, table_val("bf16_max")); + + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) { + h->vfixupimmps(vmm_temp, in, table_val("selector"), 0); + } else { + Vmm mask = Vmm(aux_vec_idxs[0]); + h->uni_vcmpps(mask, in, in, 0x03); // _CMP_UNORD_Q + h->uni_vblendvps(vmm_temp, vmm_temp, table_val("nan"), mask); + h->uni_vcmpps(mask, in, table_val("inf"), 0x00); // _CMP_EQ_OQ + h->uni_vblendvps(vmm_temp, vmm_temp, table_val("inf"), mask); + h->uni_vcmpps(mask, in, table_val("neg_inf"), 0x00); // _CMP_EQ_OQ + h->uni_vblendvps(vmm_temp, vmm_temp, table_val("neg_inf"), mask); + } + h->uni_vmovups(in, vmm_temp); + } if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) { Ymm out = Ymm(out_vec_idxs[0]); @@ -119,6 +140,11 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { push_arg_entry_of("rounding", 0x00010000, true); push_arg_entry_of("selector", selector_int32, true); push_arg_entry_of("mask_truncation_word", 0x0000ffff, true); + push_arg_entry_of("bf16_max", 0x7F7F0000, true); + push_arg_entry_of("bf16_min", 0xFF7F0000, true); + push_arg_entry_of("nan", 0x7FC00000, true); + push_arg_entry_of("inf", 0x7F800000, true); + push_arg_entry_of("neg_inf", 0xFF800000, true); } size_t aux_vecs_count() const override { diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index bdb5211009a22a..95de3720bb1e25 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -23,6 +23,7 @@ #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/x64/op/interaction.hpp" @@ -78,6 +79,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::intel_cpu::SwishNode) \ OP_EXTENSION(ov::intel_cpu::SDPAWithTransposeReshape) \ OP_EXTENSION(ov::intel_cpu::NgramNode) \ + OP_EXTENSION(ov::intel_cpu::ReadValueWithSubgraph) \ OP_EXTENSION(ov::op::internal::GatherCompressed) \ OP_EXTENSION(ov::op::internal::NonMaxSuppressionIEInternal) \ OP_EXTENSION(ov::op::internal::MulticlassNmsIEInternal) \ diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index ffd58fdb162899..3cdd2f389d29f8 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -357,6 +357,10 @@ void average_counters(const Graph& graph) { * - _.csv * For example: 0_MyModel.csv */ + if (!graph.getGraphContext()) { + DEBUG_LOG("graph.m_context is null. 
Don't dump average_counters."); + return; + } const std::string& path = graph.getConfig().debugCaps.averageCountersPath; diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index fe0df309dc32f1..1cab7ab7d8c60a 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -2935,12 +2935,19 @@ void GraphOptimizer::MatchSdpaKvCache(Graph& graph) { auto memInputNode = std::dynamic_pointer_cast(node); OPENVINO_ASSERT(memInputNode, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); - ov::optional input_shape; - ov::optional input_prc; - + ov::optional> inputShapes; + ov::optional> inputPrcs; if (!node->getParentEdges().empty()) { - input_shape = ov::optional(node->getInputShapeAtPort(0)); - input_prc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); + inputShapes = ov::optional>(std::vector{}); + inputPrcs = ov::optional>(std::vector{}); + + auto& input_shape_vec = *inputShapes; + auto& input_prc_vec = *inputPrcs; + + for (size_t i = 0; i < node->getParentEdges().size(); i++) { + input_shape_vec.push_back(node->getInputShapeAtPort(i)); + input_prc_vec.push_back(node->getOriginalInputPrecisionAtPort(i)); + } } // search for SDPA @@ -2966,8 +2973,8 @@ void GraphOptimizer::MatchSdpaKvCache(Graph& graph) { memInputNode->getOutputShapeAtPort(0), memInputNode->getOriginalOutputPrecisionAtPort(0), graph.getGraphContext(), - input_shape, - input_prc, + inputShapes, + inputPrcs, sdpa); if (!memInputNode->getParentEdges().empty()) { @@ -3064,12 +3071,18 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { auto memInputNode = std::dynamic_pointer_cast(node); OPENVINO_ASSERT(memInputNode, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); - ov::optional inputShape; - ov::optional inputPrc; - + ov::optional> inputShapes; + ov::optional> inputPrcs; if (!node->getParentEdges().empty()) { - inputShape = ov::optional(node->getInputShapeAtPort(0)); - inputPrc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); + inputShapes = ov::optional>(std::vector{}); + inputPrcs = ov::optional>(std::vector{}); + + auto& input_shape_vec = *inputShapes; + auto& input_prc_vec = *inputPrcs; + for (size_t i = 0; i < node->getParentEdges().size(); i++) { + input_shape_vec.push_back(node->getInputShapeAtPort(i)); + input_prc_vec.push_back(node->getOriginalInputPrecisionAtPort(i)); + } } // search for the MemoryOutputNode @@ -3086,6 +3099,10 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { graph.RemoveEdge(memoryOutputNode->getParentEdgeAt(0)); // there are no output edges from MemoryOutput nodes + CPU_GRAPH_OPTIMIZER_SCOPE(DropRedundantMemoryOutput_SubGraph); + auto memInpNd = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(memInpNd, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); + // now replace the existing MemoryInput with a special type that works without the corresponding MemoryOutput auto memInputSingle = std::make_shared(memInputNode->getId(), memInputNode->getName(), @@ -3093,17 +3110,24 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { memInputNode->getOutputShapeAtPort(0), memInputNode->getOriginalOutputPrecisionAtPort(0), graph.getGraphContext(), - inputShape, - inputPrc); - + inputShapes, + inputPrcs, + memInpNd->getSubGraph()); graph.AddNode(memInputSingle); if (!memInputNode->getParentEdges().empty()) { - auto parentEdge = memInputNode->getParentEdgeAt(0); - auto parent = 
parentEdge->getParent(); - const auto inputNum = parentEdge->getInputNum(); - graph.RemoveEdge(parentEdge); - graph.CreateEdge(parent, memInputSingle, inputNum, 0); + auto parentEdgeNum = memInputNode->getParentEdges().size(); + std::vector parentEdges; + for (size_t i = 0; i < parentEdgeNum; i++) { + auto parentEdge = memInputNode->getParentEdgeAt(i); + auto parent = parentEdge->getParent(); + const auto inputNum = parentEdge->getInputNum(); + parentEdges.push_back(parentEdge); + graph.CreateEdge(parent, memInputSingle, inputNum, parentEdge->getOutputNum()); + } + for (auto parentEdge : parentEdges) { + graph.RemoveEdge(parentEdge); + } } for (auto&& edge : memInputNode->getChildEdgesAtPort(0)) { diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index 0c8cddd905dc2e..f6aabe376d6eec 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -9,6 +9,7 @@ #include "utils/bfloat16.hpp" #if defined(OPENVINO_ARCH_X86_64) +# include "cpu/x64/jit_avx512_core_fp8cvt.hpp" # include "nodes/kernels/x64/jit_kernel.hpp" #else # include "cpu_memory.h" @@ -27,6 +28,18 @@ using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu::x64; using namespace Xbyak; +enum f8_type { none, f8e4m3, f8e5m2 }; + +template +f8_type get_f8_type() { + if (std::is_same::value || std::is_same::value) { + return f8_type::f8e4m3; + } else if (std::is_same::value || std::is_same::value) { + return f8_type::f8e5m2; + } + return f8_type::none; +} + template void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst); @@ -50,12 +63,14 @@ void convert_vec(jit_generator& gen, const RegExp& src, cons gen.movdqu(gen.xword[dst], f16vec); } +template class jit_convert_array : public jit_kernel { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_convert_array) void generate() override { - constexpr size_t vlen = 8u; - constexpr size_t vlen_log2 = 3; + bool is_fp8 = f8_e4m3_emu_ || f8_e5m2_emu_; + size_t vlen = is_fp8 ? 16u : 8u; + size_t vlen_log2 = is_fp8 ? 
4 : 3; preamble(); @@ -84,17 +99,24 @@ class jit_convert_array : public jit_kernel { auto tail_size = var(); tail_size = size; - tail_size <<= static_cast(std::logb(_src_size)) - 1; - copy(tmp.pointer(), src, tail_size); + tail_size <<= static_cast(std::logb(_src_size)); + copy(tmp.pointer(), src, tail_size); _convert_vec(*this, tmp.pointer(), tmp.pointer()); tail_size = size; - tail_size <<= static_cast(std::logb(_dst_size)) - 1; - copy(dst, tmp.pointer(), tail_size); + tail_size <<= static_cast(std::logb(_dst_size)); + copy(dst, tmp.pointer(), tail_size); }); postamble(); + + if (f8_e4m3_emu_) + f8_e4m3_emu_->prepare_table(); + if (f8_e5m2_emu_) + f8_e5m2_emu_->prepare_table(); + if (uni_vcvtneps2bf16_) + uni_vcvtneps2bf16_->emit_data(); } public: @@ -108,16 +130,37 @@ class jit_convert_array : public jit_kernel { typedef void (*convert_vec_t)(jit_generator&, const RegExp&, const RegExp&); - jit_convert_array(convert_vec_t convert_vec, size_t src_size, size_t dst_size) + jit_convert_array(convert_vec_t convert_vec) : jit_kernel(jit_name()), _convert_vec(convert_vec), - _src_size(src_size), - _dst_size(dst_size) {} + _src_size(sizeof(src_t)), + _dst_size(sizeof(dst_t)) { + const auto type = get_f8_type(); + if (type == f8_type::f8e4m3) { + f8_e4m3_emu_ = std::make_shared(this, + fp8_emu_reserv_1_, + fp8_emu_reserv_2_, + fp8_emu_reserv_3_, + fp8_emu_reserv_4_, + fp8_emu_reserv_5_, + fp8_emu_scratch_); + } else if (type == f8_type::f8e5m2) { + f8_e5m2_emu_ = std::make_shared(this, + fp8_emu_reserv_1_, + fp8_emu_reserv_2_, + fp8_emu_reserv_3_, + fp8_emu_kmask_aux_, + fp8_emu_scratch_); + } + const bool is_dst_bf16 = std::is_same::value; + if (is_dst_bf16 && mayiuse(cpu_isa_t::avx512_core)) { + uni_vcvtneps2bf16_ = std::make_shared(this, cpu_isa_t::avx512_core); + } + } - template static fn_t get() { if (mayiuse(cpu_isa_t::avx2) && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { - static jit_convert_array converter(convert_vec, sizeof(src_t), sizeof(dst_t)); + static jit_convert_array converter(convert_vec); auto& generator = static_cast(converter); generator.create_kernel(); return (fn_t)generator.jit_ker(); @@ -125,16 +168,192 @@ class jit_convert_array : public jit_kernel { return nullptr; } + std::shared_ptr get_f8_e4m3_emu() const { + return f8_e4m3_emu_; + } + + std::shared_ptr get_f8_e5m2_emu() const { + return f8_e5m2_emu_; + } + + std::shared_ptr get_uni_vcvtneps2bf16() const { + return uni_vcvtneps2bf16_; + } + private: convert_vec_t _convert_vec; size_t _src_size; size_t _dst_size; + + std::shared_ptr f8_e4m3_emu_; + std::shared_ptr f8_e5m2_emu_; + std::shared_ptr uni_vcvtneps2bf16_; + + const Reg64 fp8_emu_scratch_ = rax; + const Zmm fp8_emu_reserv_1_ = Zmm(9); + const Zmm fp8_emu_reserv_2_ = Zmm(10); + const Zmm fp8_emu_reserv_3_ = Zmm(11); + const Zmm fp8_emu_reserv_4_ = Zmm(12); + const Zmm fp8_emu_reserv_5_ = Zmm(13); + const Opmask fp8_emu_kmask_aux_ = Opmask(1); }; +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovups(f32vec, gen.zword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f32_to_f8(f8vec, f32vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + 
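// Editor's note: the fp8 kernels process 16 elements per step (one zmm of 16 f32 lanes
// pairs with one xmm of 16 f8 bytes), hence vlen = 16 and vlen_log2 = 4, while the f16c
// path keeps 8 and 3. The tail handling above shifts by log2 of the element size, which
// simply turns an element count into a byte count:
const size_t tail_elems = 5;
const size_t src_bytes = tail_elems << 2;  // * sizeof(float) == 20
const size_t dst_bytes = tail_elems << 0;  // * sizeof(f8)    == 5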
cvt.get_f8_e4m3_emu()->vcvt_f8_to_f32(f32vec, f8vec); + gen.vmovups(gen.zword[dst], f32vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f16vec, gen.yword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f16_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f16(f16vec, f8vec); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vpmovzxwd(f16vec, gen.yword[src]); + gen.vpslld(f16vec, f16vec, 16); + cvt.get_f8_e4m3_emu()->vcvt_f32_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f32(f32vec, f8vec); + cvt.get_uni_vcvtneps2bf16()->emit_code({static_cast(f32vec.getIdx())}, + {static_cast(f16vec.getIdx())}); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovups(f32vec, gen.zword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f32_to_f8(f8vec, f32vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f32(f32vec, f8vec); + gen.vmovups(gen.zword[dst], f32vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f16vec, gen.yword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f16_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f16(f16vec, f8vec); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vpmovzxwd(f16vec, gen.yword[src]); + gen.vpslld(f16vec, f16vec, 16); + cvt.get_f8_e5m2_emu()->vcvt_f32_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + 
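// Editor's note: the vpmovzxwd + vpslld(16) pair used by the bf16 -> f8 kernels above is
// the standard bf16 -> f32 widening, since bf16 is the upper half of an IEEE f32.
// Scalar equivalent for reference (needs <cstdint> and <cstring>):
static inline float bf16_bits_to_f32(uint16_t bits) {
    uint32_t u = static_cast<uint32_t>(bits) << 16;  // move into the high half
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}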
gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f32(f32vec, f8vec); + cvt.get_uni_vcvtneps2bf16()->emit_code({static_cast(f32vec.getIdx())}, + {static_cast(f16vec.getIdx())}); + gen.vmovdqu(gen.yword[dst], f16vec); +} + template void jit_convert(const TI* arg, TO* out, size_t count) { - using jit_impl = jit_convert_array; - static auto converter = jit_impl::get(); + using jit_impl = jit_convert_array; + static auto converter = jit_impl::get(); if (converter) { typename jit_impl::args_t args = {arg, out, count}; @@ -185,6 +404,12 @@ const std::tuple& Range::fit(const ov::element::Type& prec) { if (prec.is_real()) { double lbound, ubound; switch (prec) { + case ov::element::f8e4m3: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + case ov::element::f8e5m2: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); case ov::element::bf16: lbound = static_cast(std::numeric_limits::lowest()); ubound = static_cast(std::numeric_limits::max()); @@ -293,6 +518,18 @@ struct ConvertPrecision> { src_t lbound, ubound; std::tie(lbound, ubound) = ctx.range(); + // Align with the behavior of ngraph ref and jit implementation. Conversion from f8e4m3-inf + // to float should output float-inf instead of f8e4m3-max. Proper handling of special values + // (nan, inf, overflow) has already been assured by the conversion process. + if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + parallel_for(ctx.size, [&](size_t i) { + dst[i] = static_cast(src[i]); + }); + ctx.converted = true; + return; + } + if (std::is_integral::value || ctx.interimPrc.is_real() || std::is_integral::value) { parallel_for(ctx.size, [&](size_t i) { dst[i] = static_cast(std::max(std::min(src[i], ubound), lbound)); @@ -492,6 +729,12 @@ struct ConvertPrecision> { PrecisionInfo::value_type, \ PrecisionInfo::value_type) +#define INTEL_CPU_CVT_FP8_LIST \ + INTEL_CPU_CVT(f32, f8e4m3), INTEL_CPU_CVT(f16, f8e4m3), INTEL_CPU_CVT(bf16, f8e4m3), INTEL_CPU_CVT(f8e4m3, f32), \ + INTEL_CPU_CVT(f8e4m3, f16), INTEL_CPU_CVT(f8e4m3, bf16), INTEL_CPU_CVT(f32, f8e5m2), \ + INTEL_CPU_CVT(f16, f8e5m2), INTEL_CPU_CVT(bf16, f8e5m2), INTEL_CPU_CVT(f8e5m2, f32), \ + INTEL_CPU_CVT(f8e5m2, f16), INTEL_CPU_CVT(f8e5m2, bf16) + #define INTEL_CPU_CVT_LIST \ INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ @@ -535,7 +778,8 @@ struct ConvertPrecision> { INTEL_CPU_CVT(boolean, f16), INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ - INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) + INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean), \ + INTEL_CPU_CVT_FP8_LIST #define INTEL_CPU_CVT_FROM_BIN_LIST \ INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), INTEL_CPU_CVT(u1, f64), \ @@ -667,6 +911,35 @@ struct ConvertFromByteFPPrecision> { } }; +#if defined(OPENVINO_ARCH_X86_64) +struct ConvertFP8Context { + const void* srcPtr; + void* dstPtr; + size_t size; + bool converted; +}; + +template 
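// Editor's sketch (hypothetical call site, assuming the 5-argument cpu_convert
// overload): with avx512_core_fp16 available, f32 <-> f8 conversions route through the
// JIT kernels above. 448.0f is the largest finite f8e4m3 value, so it survives the
// round trip.
std::vector<float> src = {0.5f, -1.25f, 448.0f};
std::vector<uint8_t> dst(src.size());
cpu_convert(src.data(), dst.data(), ov::element::f32, ov::element::f8e4m3, src.size());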
+struct ConvertFP8Precision; + +template +struct ConvertFP8Precision> { + void operator()(ConvertFP8Context& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); + constexpr size_t batch = 64; + const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); + parallel_for(iterations, [&](size_t i) { + const size_t offset = i * batch; + const size_t current_batch_size = std::min(ctx.size - offset, batch); + jit_convert(src + offset, dst + offset, current_batch_size); + }); + + ctx.converted = true; + } +}; +#endif + void cpu_convert(const void* srcPtr, void* dstPtr, ov::element::Type srcPrc, @@ -728,7 +1001,7 @@ void cpu_convert(const void* srcPtr, OV_SWITCH(intel_cpu, ConvertFrom4BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); - } else if (srcPrc.bitwidth() == 8u && srcPrc.is_real()) { + } else if (srcPrc == ov::element::f8e8m0) { ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false}; OV_SWITCH(intel_cpu, ConvertFromByteFPPrecision, @@ -737,6 +1010,15 @@ void cpu_convert(const void* srcPtr, INTEL_CPU_CVT_FROM_BYTE_FP_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); +#if defined(OPENVINO_ARCH_X86_64) + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16) && + (one_of(srcPrc, ov::element::f8e4m3, ov::element::f8e5m2) || + one_of(dstPrc, ov::element::f8e4m3, ov::element::f8e5m2))) { + ConvertFP8Context ctx{srcPtr, dstPtr, size, false}; + OV_SWITCH(intel_cpu, ConvertFP8Precision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FP8_LIST); + if (!ctx.converted) + OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); +#endif } else { ConvertContext ctx{srcPtr, dstPtr, size, interimPrc, dstPrc, false}; OV_SWITCH(intel_cpu, ConvertPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_LIST); diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 5daefa01eddfab..c2e770db84695b 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -341,8 +341,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener reg_d_bias)); } - if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) - uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); + if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) { + auto const mode = jep_.do_output_saturation ? jit_uni_vcvtneps2bf16::conversion_mode::saturation_mode + : jit_uni_vcvtneps2bf16::conversion_mode::default_mode; + uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa, element::bf16, mode)); + } const auto& jep = jep_; @@ -478,7 +481,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener apply_post_ops(true, jep_.oc_size > 1 ? 
j * sizeof(float) : 0); - store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], xmm_dst, exec_prc, jep.dst_prc); + store_scalar(ptr[reg_dst + j * jep.dst_prc.size()], + xmm_dst, + exec_prc, + jep.dst_prc, + jep.do_output_saturation); } for (size_t i = 0; i < jep.inputs_number; i++) @@ -546,7 +553,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener apply_post_ops(true); - store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc); + store_scalar(ptr[reg_dst], xmm_dst, exec_prc, jep.dst_prc, jep.do_output_saturation); for (size_t i = 0; i < jep.inputs_number; i++) if (jep.src_size[i] != 1) @@ -1012,7 +1019,8 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener inline void store_scalar(const Xbyak::Address& op, Xmm xmm_dst, ov::element::Type src_prc, - ov::element::Type dst_prc) { + ov::element::Type dst_prc, + const bool do_output_saturation) { if (src_prc == dst_prc) { switch (src_prc.size()) { case 4: @@ -1047,7 +1055,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vmovss(op, xmm_dst); break; case ov::element::bf16: - uni_vpsrld(xmm_dst, xmm_dst, 16); + if (do_output_saturation) + uni_vpsrld(xmm_dst, xmm_dst, 16); + else + uni_vcvtneps2bf16->emit_code({static_cast<size_t>(xmm_dst.getIdx())}, + {static_cast<size_t>(xmm_dst.getIdx())}); uni_vpextrw(op, xmm_dst, 0x0); break; case ov::element::f16: @@ -1355,6 +1367,7 @@ struct EltwiseKey { ov::element::Type outPrc; dnnl::post_ops postOps; EltwiseImplType implType; + bool doOutputSaturation; size_t hash() const { using namespace dnnl::impl; @@ -1390,6 +1403,10 @@ struct EltwiseKey { seed = hash_combine(seed, outPrc.hash()); seed = get_post_op_hash(seed, *postOps.get()); seed = hash_combine(seed, implType); + + if (outPrc == ov::element::bf16) { + seed = hash_combine(seed, doOutputSaturation); + } return seed; } @@ -1416,6 +1433,8 @@ struct EltwiseKey { result = result && (inpDims[i] == rhs.inpDims[i]); } } + if (doOutputSaturation != rhs.doOutputSaturation) + return false; } return result; @@ -1448,7 +1467,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { const std::vector<ov::element::Type>& inpPrc, const ov::element::Type& outPrc, const dnnl::post_ops& post_ops, - bool useRuntimePtrs) { + bool useRuntimePtrs, + bool doOutputSaturation) { auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) { for (size_t i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { dims[dims.size() - 1] *= dims[i]; @@ -1639,6 +1659,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { jep.dst_prc = outPrc; jep.work_amount = jep.dst_size = jep.dims.back(); jep.oc_size = oc_size; + jep.do_output_saturation = doOutputSaturation; std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), [](size_t& offset) { return offset * sizeof(float); }); @@ -2160,7 +2181,8 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) { key.inpPrc, key.outPrc, key.postOps, - key.implType == EltwiseImplType::optimizedShapeAgnostic); + key.implType == EltwiseImplType::optimizedShapeAgnostic, + key.doOutputSaturation); } bool Eltwise::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept { @@ -2862,6 +2884,18 @@ void Eltwise::prepareParams() { } } + // FP32 constant inputs may contain values outside the BF16 representable range. When the output precision is BF16, we + // choose the "saturation" mode for the fp32->bf16 conversion procedure to prevent -Inf/+Inf values in the + // outputs. Since the "saturation" conversion is more time-consuming, a better solution would be to clamp the constants + // at the compilation stage (ticket: 159589). + key.doOutputSaturation = false; + for (size_t i = 0; i < getParentEdges().size(); i++) { + if (getParentEdgeAt(i)->getParent()->isConstant()) { + key.doOutputSaturation = true; + break; + } + } + auto cache = context->getParamsCache(); auto result = cache->getOrCreate(key, buildExecutor); execPtr = result.first; diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.h b/src/plugins/intel_cpu/src/nodes/eltwise.h index d0ca94e08824c8..8e5fd643665ffd 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.h +++ b/src/plugins/intel_cpu/src/nodes/eltwise.h @@ -43,6 +43,7 @@ struct jit_eltwise_params { size_t work_amount; bool use_runtime_ptrs; + bool do_output_saturation; }; struct jit_eltwise_call_args_indexes { diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 4bb2f714b284fd..34b659a1ef2882 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -11,6 +11,7 @@ #include "openvino/core/shape.hpp" #include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -226,7 +227,8 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op::v0::Constant::get_type_info_static(), op::v0::Result::get_type_info_static(), op::v3::ReadValue::get_type_info_static(), - op::v6::ReadValue::get_type_info_static())) + op::v6::ReadValue::get_type_info_static(), + ov::intel_cpu::ReadValueWithSubgraph::get_type_info_static())) OPENVINO_THROW_NOT_IMPLEMENTED("CPU Input node doesn't support ngraph operation ", op->get_type_name(), " with name ", @@ -479,7 +481,11 @@ void Input::selectOptimalPrimitiveDescriptor() { supportedPrimitiveDescriptors.clear(); // and just use parent memory descriptor for Output node to avoid reorders insertion - NodeConfig config({PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)), BlockedMemoryDesc::FULL_MASK, 0)}, {}); + std::vector<PortConfig> inConfs; + for (size_t i = 0; i < getParentEdges().size(); i++) { + inConfs.push_back({PortConfig(getParentOutputMemDesc(getParentEdgeAt(i)), BlockedMemoryDesc::FULL_MASK, 0)}); + } + NodeConfig config(inConfs, {}); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); selectPrimitiveDescriptorByIndex(0); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp index 1bf64d096e4a84..c4fb7608d521de 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp @@ -57,6 +57,7 @@ struct jit_eltwise_params { size_t work_amount; bool use_runtime_ptrs; + bool do_output_saturation; }; struct jit_eltwise_call_args_indexes { diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index 5a0bd7a1e3dff1..d9c9dba5a1219d 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -10,8 +10,11 @@ #include "dnnl_extension_utils.h" #include "dnnl_types.h" #include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/common/cpu_convert.h" #include "scaled_attn.h" +#include 
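// Editor's sketch (illustrative, not part of the patch): a scalar reference for the
// saturating fp32 -> bf16 conversion selected here. Finite values clamp into the bf16
// range while NaN and +/-Inf pass through, matching the bf16_min/bf16_max/nan/inf table
// entries registered by the emitter. Helper name is illustrative; needs <cmath>,
// <cstring>, <algorithm> and <cstdint>.
static inline uint16_t f32_to_bf16_saturate(float x) {
    if (std::isnan(x)) return 0x7FC0;                             // quiet NaN
    if (std::isinf(x)) return std::signbit(x) ? 0xFF80 : 0x7F80;  // +/-Inf
    const float bf16_max = 3.3895314e38f;                         // bits 0x7F7F0000
    x = std::max(std::min(x, bf16_max), -bf16_max);
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u += 0x7FFF + ((u >> 16) & 1);                                // round to nearest even
    return static_cast<uint16_t>(u >> 16);
}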
"shape_inference/shape_inference_internal_dyn.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" #include "utils/general_utils.h" using namespace dnnl; @@ -373,8 +376,10 @@ bool MemoryInputBase::isSupportedOperation(const std::shared_ptr try { if (!one_of(op->get_type_info(), ov::op::v3::ReadValue::get_type_info_static(), - ov::op::v6::ReadValue::get_type_info_static())) { - errorMessage = "Node is not an instance of ReadValue from the operation set v3 or v6."; + ov::op::v6::ReadValue::get_type_info_static(), + ov::intel_cpu::ReadValueWithSubgraph::get_type_info_static())) { + errorMessage = "Node is not an instance of ReadValue from the operation set v3 " + "or v6, or is not an instance of intel_cpu::ReadValueWithSubgraph"; return false; } } catch (...) { @@ -402,22 +407,26 @@ MemoryInputBase::MemoryInputBase(const std::string id, const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, MemoryInputBase::mode mode) : Input(output_shape, output_prc, name, type, context), MemoryStateNode(id) { outputShapes.emplace_back(output_shape); addOriginalOutputPrecision(output_prc); if (input_shape) { - inputShapes.push_back(*input_shape); - isDynamic = isDynamic || input_shape->isDynamic(); + for (auto inp_shape : *input_shape) { + inputShapes.push_back(inp_shape); + isDynamic = isDynamic || inp_shape.isDynamic(); + } if (isDynamic && !shapeInference) { shapeInference = PassThroughShapeInferFactory().makeShapeInfer(); } } if (input_prc) { - addOriginalInputPrecision(*input_prc); + for (auto inp_prc : *input_prc) { + addOriginalInputPrecision(inp_prc); + } } if (created()) { context->getMemoryStatesRegister()->registerInput(this); @@ -456,8 +465,11 @@ void MemoryInputBase::initSupportedPrimitiveDescriptors() { NodeConfig config; if (!getParentEdges().empty()) { - const auto& inputShape = getInputShapeAtPort(0); - config.inConfs.emplace_back(descCreators.at(LayoutType::ncsp)->createSharedDesc(precision, inputShape)); + for (size_t i = 0; i < getParentEdges().size(); i++) { + const auto& inputShape = getInputShapeAtPort(i); + auto inp_prc = getOriginalInputPrecisionAtPort(i); + config.inConfs.emplace_back(descCreators.at(LayoutType::ncsp)->createSharedDesc(inp_prc, inputShape)); + } } const auto& outputShape = getOutputShapeAtPort(0); @@ -562,6 +574,47 @@ void MemoryInputBase::bypassAssignState() { return; } +MemoryInput::MemoryInput(const std::shared_ptr& op, const GraphContext::CPtr ctx) + : MemoryInputBase::MemoryInputBase(op, ctx) { + auto rvWithSubgraph = ov::as_type_ptr(op); + if (rvWithSubgraph) { + body = rvWithSubgraph->get_function(); + subGraph = make_unique(); + if (isDynamic) { + shapeInference = InternalDynShapeInferFactory().makeShapeInfer(); + } + } +} + +MemoryInput::MemoryInput(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func, + mode mode) + : MemoryInputBase::MemoryInputBase(id, + name, + type, + output_shape, + output_prc, + context, + input_shape, + input_prc, + mode) { + body = func; + + if (haveSubgraph()) { + subGraph = make_unique(); + if (isDynamic) { + shapeInference = 
InternalDynShapeInferFactory().makeShapeInfer();
+        }
+    }
+}
+
 bool MemoryInput::needInitGraphProcessing() const {
     return !getParentEdges().empty() && getAssignedState()->is_reset_state();
 }
@@ -620,6 +673,59 @@ void MemoryInput::initOptimalPrimitiveDescriptor() {
     config.outConfs.front().setMemDesc(mem_desc);
     // bypass any checks, we enforce the child descriptor
     selectedPd->setConfig(config);
+
+    if (haveSubgraph()) {
+        // Adopt the parent configuration to avoid inserting a reorder before the MemoryInput.
+        std::vector<node::Input::InputConfig> graphInputConfig;
+
+        for (size_t i = 0; i < getParentEdges().size(); i++) {
+            auto desc = getParentOutputMemDesc(getParentEdgeAt(i));
+            graphInputConfig.emplace_back(node::Input::InputConfig{desc, true});
+        }
+
+        std::vector<node::Input::OutputConfig> graphOutputConfig;
+        for (auto&& portConfig : config.outConfs) {
+            auto desc = portConfig.getMemDesc();
+            graphOutputConfig.emplace_back(node::Input::OutputConfig{desc, true});
+        }
+
+        // configure the inner graph to get the information about output memory descriptors
+        subGraph->Init(body, context, graphInputConfig, graphOutputConfig);
+    }
+}
+
+// @todo add an ASCII diagram for memory mapping / reuse
+void MemoryInput::createPrimitive() {
+    MemoryInputBase::createPrimitive();
+    if (haveSubgraph()) {
+        OPENVINO_ASSERT(getOriginalInputsNumber() == subGraph->inputsNumber(),
+                        "Number of node inputs must be equal to the number of inner graph's inputs: ",
+                        getOriginalInputsNumber(),
+                        " != ",
+                        subGraph->inputsNumber());
+
+        std::vector<MemoryPtr> inputMemory;
+        for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
+            auto srcEdgeMem = getSrcMemoryAtPort(i);
+            // create separate input memory objects instead of sharing them, to avoid data corruption
+            auto mem = std::make_shared<Memory>(getEngine(), srcEdgeMem->getDescPtr(), srcEdgeMem->getMemoryBlock());
+            subgraphMemoryPtrs.push_back(mem);
+            inputMemory.emplace_back(std::move(mem));
+        }
+
+        OPENVINO_ASSERT(getOriginalOutputsNumber() == subGraph->outputsNumber(),
+                        "Number of node outputs must be equal to the number of inner graph's outputs: ",
+                        getOriginalOutputsNumber(),
+                        " != ",
+                        subGraph->outputsNumber());
+
+        std::vector<MemoryPtr> outputMemory;
+        for (size_t i = 0; i < getOriginalOutputsNumber(); i++) {
+            outputMemory.emplace_back(getDstMemoryAtPort(i));
+        }
+
+        subGraph->Activate(inputMemory, outputMemory);
+    }
 }
 
 void MemoryInput::runDynamic(dnnl::stream strm) {
@@ -655,13 +761,43 @@ void MemoryInput::runDynamic(dnnl::stream strm) {
         memBlock->reset();
     }
 
-    // reshape output
-    const auto& newDims = processInitGraph ? getSrcMemoryAtPort(0)->getStaticDims() : stateDims;
+    MemoryPtr src = assignedMem;  // declare src memory
+    if (processInitGraph) {
+        if (haveSubgraph()) {
+            // put the prepareParams logic into runDynamic, because the init graph is not called each time.
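+            // The per-input memory objects created in createPrimitive() share their memory blocks with the
+            // parent edges, so only their descriptors have to be re-synchronized with the current input
+            // shapes before running the inner graph.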
+            for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
+                // since the external and internal descriptors are compatible, we may pass the descriptor
+                subgraphMemoryPtrs[i]->redefineDesc(getSrcMemoryAtPort(i)->getDescPtr());
+            }
+
+            subGraph->ResetInferCount();
+            subGraph->Infer();
+            // depending on the memory sharing solution, we can return here if the memory is substituted from the
+            // external graph or override the src pointer with the memory pointer pointing to the subgraph output
+            // memory
+            OPENVINO_ASSERT(subGraph->outputsNumber() == 1);
+            src = subGraph->getOutputNodeByIndex(0)->getSrcMemoryAtPort(0);
+
+            // since the shape inference (InternalDynShapeInfer does nothing) is performed, the memory of the extra
+            // child edges attached to the output ports has to be updated after the inference of the inner graph
+            // finishes
+            auto& childEdges = getChildEdges();
+            for (size_t j = 1; j < childEdges.size(); j++) {
+                auto& childEdge = childEdges[j];
+                auto childEdgePtr = childEdge.lock();
+                assert(childEdgePtr);
+                assert(0 == childEdgePtr->getInputNum());
+                childEdgePtr->getMemoryPtr()->redefineDesc(src->getDescPtr());
+            }
+        } else {
+            src = getSrcMemoryAtPort(0);
+        }
+    }
+    // reshape output
+    const auto& newDims = src->getStaticDims();
     redefineOutputMemory(0, newDims);
 
     // copy data when necessary
-    auto src = processInitGraph ? getSrcMemoryAtPort(0) : assignedMem;
     if (src->getData() != dst->getData()) {
         dst->load(*src);
     }
@@ -692,10 +828,21 @@ void MemoryInput::runStatic(dnnl::stream strm) {
         memBlock->reset();
     }
 
-    const auto processInitGraph = needInitGraphProcessing();
+    const bool processInitGraph = needInitGraphProcessing();
+    MemoryPtr src = assignedMem;  // declare src memory
+    if (processInitGraph) {
+        if (haveSubgraph()) {
+            subGraph->ResetInferCount();
+            subGraph->Infer();
+
+            OPENVINO_ASSERT(subGraph->outputsNumber() == 1);
+            src = subGraph->getOutputNodeByIndex(0)->getSrcMemoryAtPort(0);
+        } else {
+            src = getSrcMemoryAtPort(0);
+        }
+    }
 
     // copy data when necessary
-    auto src = processInitGraph ?
getSrcMemoryAtPort(0) : assignedMem; auto dst = getDstMemoryAtPort(0); if (src->getData() != dst->getData()) { dst->load(*src); @@ -749,6 +896,10 @@ MemStatePtr MemoryInput::makeState() const { original_desc); } +std::shared_ptr MemoryInput::getSubGraph() { + return body; +} + bool MemoryInput::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { return MemoryInputBase::isSupportedOperation(op, errorMessage); } @@ -759,8 +910,8 @@ MemoryInputSDPA::MemoryInputSDPA(const std::string id, const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, const std::shared_ptr& sdpaNode) : MemoryInputBase(id, name, type, output_shape, output_prc, context, input_shape, input_prc), m_sdpaNode(sdpaNode) {} @@ -865,8 +1016,9 @@ MemoryInputSingle::MemoryInputSingle(const std::string id, const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc) + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func) : MemoryInput(id, name, type, @@ -875,6 +1027,7 @@ MemoryInputSingle::MemoryInputSingle(const std::string id, context, input_shape, input_prc, + func, MemoryInputBase::mode::single_read_value) {} MemStatePtr MemoryInputSingle::makeState() const { diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index 9c0c9664ce8a27..1d40849b0f3356 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -4,6 +4,8 @@ #pragma once +#include + #include #include "input.h" @@ -162,8 +164,8 @@ class MemoryInputBase : public Input, public MemoryStateNode { const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, mode mode = mode::read_value_assign); protected: @@ -192,15 +194,30 @@ class MemoryInputBase : public Input, public MemoryStateNode { class MemoryInput : public MemoryInputBase { public: - using MemoryInputBase::MemoryInputBase; + MemoryInput(const std::shared_ptr& op, const GraphContext::CPtr ctx); + MemoryInput(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func = nullptr, + mode mode = mode::read_value_assign); + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void initOptimalPrimitiveDescriptor() override; void resolveInPlaceEdges(Edge::LOOK look) override; + void createPrimitive() override; + MemStatePtr makeState() const override; + std::shared_ptr getSubGraph(); + protected: bool needInitGraphProcessing() const; void runStatic(dnnl::stream strm) override; @@ -210,7 +227,15 @@ class MemoryInput : public MemoryInputBase { void assignStateHook() override { /*pass*/ } + bool haveSubgraph() const { + return body != nullptr; + } + private: + std::shared_ptr body = nullptr; + std::unique_ptr subGraph = nullptr; + std::vector subgraphMemoryPtrs; + ProxyMemoryBlockPtr memBlock = nullptr; }; @@ -222,8 +247,9 @@ class MemoryInputSingle : 
public MemoryInput { const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc); + const ov::optional>& input_shape, + const ov::optional>& input_prc, + std::shared_ptr func); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -242,8 +268,8 @@ class MemoryInputSDPA : public MemoryInputBase { const Shape& output_shape, const ov::element::Type& output_prc, const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc, + const ov::optional>& input_shape, + const ov::optional>& input_prc, const std::shared_ptr& sdpaNode); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp b/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp index 58d855a091d716..19d4863c3afbcb 100644 --- a/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory_state_base.cpp @@ -11,10 +11,8 @@ using namespace ov::intel_cpu::node; MemoryNode::MemoryNode(const std::shared_ptr& op) { - if (auto assignOp = ov::as_type_ptr(op)) { + if (auto assignOp = std::dynamic_pointer_cast(op)) { m_id = assignOp->get_variable_id(); - } else if (auto readValueOp = ov::as_type_ptr(op)) { - m_id = readValueOp->get_variable_id(); } else { OPENVINO_THROW("Unexpected ov::Node type: ", op->get_type_info().name, " in MemoryNode"); } diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index db55c728df725e..b3c2aa0b298a5a 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -218,6 +218,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< ov::element::Type_t::i4, ov::element::Type_t::u8, ov::element::Type_t::i8, + ov::element::Type_t::f8e4m3, + ov::element::Type_t::f8e5m2, ov::element::Type_t::u16, ov::element::Type_t::i16, ov::element::Type_t::u32, diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.cpp new file mode 100644 index 00000000000000..39df4b6a29c099 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "read_value_with_subgraph.hpp" + +#include "itt.hpp" +#include "transformations/itt.hpp" + +ov::intel_cpu::ReadValueWithSubgraph::ReadValueWithSubgraph(const std::shared_ptr& variable, + std::shared_ptr body) { + m_variable = variable; + set_function(body); +} + +ov::intel_cpu::ReadValueWithSubgraph::ReadValueWithSubgraph(const std::shared_ptr& variable, + std::shared_ptr body, + const OutputVector& args) + : ReadValueWithSubgraph(variable, body) { + set_arguments(args); +} + +std::string ov::intel_cpu::ReadValueWithSubgraph::get_variable_id() const { + OPENVINO_ASSERT(m_variable, "Variable is not initialized. Variable_id is unavailable"); + return get_variable()->get_info().variable_id; +} + +void ov::intel_cpu::ReadValueWithSubgraph::set_input(const Output& value, + const std::shared_ptr& body_parameter) { + OPENVINO_ASSERT(body_parameter != nullptr, "Missing parameter! 
parameter is nullptr!");
+    auto param_index = m_bodies[0]->get_parameter_index(body_parameter);
+
+    OPENVINO_ASSERT(param_index != -1, "Missing parameter ", body_parameter->get_friendly_name(), " for \'body\'!");
+
+    set_invariant_inputs(value, {body_parameter});
+}
+
+ov::Output<ov::Node> ov::intel_cpu::ReadValueWithSubgraph::set_output(
+    const std::shared_ptr<ov::op::v0::Result>& body_result) {
+    OPENVINO_ASSERT(body_result != nullptr, "Incorrect result in \"body\"! Result can't be \'nullptr\'");
+    auto result_id = m_bodies[0]->get_result_index(body_result);
+
+    OPENVINO_ASSERT(result_id != -1, "Missing result ", body_result->get_friendly_name(), " in \'body\'!");
+
+    return set_body_outputs({body_result});
+}
+
+std::shared_ptr<ov::Node> ov::intel_cpu::ReadValueWithSubgraph::clone_with_new_inputs(
+    const OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(intel_cpu_ReadValueWithSubgraphNode_clone_with_new_inputs);
+
+    check_new_args_count(this, new_args);
+    auto op =
+        std::make_shared<ReadValueWithSubgraph>(this->get_variable(), get_function()->clone(), new_args);
+    OPENVINO_ASSERT(op != nullptr,
+                    "Cannot clone ",
+                    description(),
+                    " operation with name ",
+                    get_friendly_name());
+    op->set_output_size(m_output_descriptions[0].size());
+    for (const auto& m_input_descr : m_input_descriptions[0]) {
+        op->m_input_descriptions[0].push_back(m_input_descr->copy());
+    }
+    for (const auto& m_output_descr : m_output_descriptions[0]) {
+        op->m_output_descriptions[0].push_back(m_output_descr->copy());
+    }
+    op->validate_and_infer_types();
+    return op;
+}
+
+bool ov::intel_cpu::ReadValueWithSubgraph::visit_attributes(AttributeVisitor& visitor) {
+    INTERNAL_OP_SCOPE(intel_cpu_ReadValueWithSubgraphNode_visit_attributes);
+    visitor.on_attribute("variable_id", m_variable);
+
+    auto variable_info = m_variable->get_info();
+    visitor.on_attribute("variable_type", variable_info.data_type);
+    visitor.on_attribute("variable_shape", variable_info.data_shape);
+    m_variable->update(variable_info);
+
+    visitor.on_attribute("body", m_bodies[0]);
+    visitor.on_attribute("inputs", m_input_descriptions[0]);
+    visitor.on_attribute("outputs", m_output_descriptions[0]);
+    return true;
+}
+
+void ov::intel_cpu::ReadValueWithSubgraph::validate_and_infer_types() {
+    INTERNAL_OP_SCOPE(intel_cpu_ReadValueWithSubgraphNode_validate_and_infer_types);
+
+    NODE_VALIDATION_CHECK(this,
+                          m_bodies.size() == 1,
+                          "ReadValueWithSubgraph contains incorrect number of bodies:",
+                          m_bodies.size());
+
+    validate_and_infer_type_body(get_function(), m_input_descriptions[0]);
+
+    auto output_nodes = outputs();
+
+    auto outputs_map = get_mapping_outputs_on_body_description(m_output_descriptions[0]);
+
+    // Checking each output
+    for (size_t output_index = 0; output_index < output_nodes.size(); ++output_index) {
+        NODE_VALIDATION_CHECK(this,
+                              outputs_map.count(output_index) != 0,
+                              "Incorrect association in body! Output ",
Output ", + output_index, + " is not associated with results in then_body!"); + + auto desc = outputs_map.at(output_index); + + auto node_result = m_bodies[0]->get_results().at(desc->m_body_value_index)->input_value(0); + + set_output_type(output_index, node_result.get_element_type(), node_result.get_partial_shape()); + } +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.hpp new file mode 100644 index 00000000000000..037f8eb302afcd --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/read_value_with_subgraph.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "openvino/op/read_value.hpp" +#include "openvino/op/util/sub_graph_base.hpp" +#include "transformations/cpu_opset/common/op/submodel.hpp" + +namespace ov { +namespace intel_cpu { + +class ReadValueWithSubgraph : public ov::op::util::SubGraphOp, public ov::op::util::VariableExtension { +public: + OPENVINO_OP("ReadValueWithSubgraph", "cpu_plugin_opset"); + + ReadValueWithSubgraph() = default; + ReadValueWithSubgraph(const std::shared_ptr& variable, std::shared_ptr body); + ReadValueWithSubgraph(const std::shared_ptr& variable, + std::shared_ptr body, + const OutputVector& args); + + std::string get_variable_id() const override; + + void set_input(const Output& value, const std::shared_ptr& body_parameter); + + Output set_output(const std::shared_ptr& body_result); + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.cpp new file mode 100644 index 00000000000000..e2b283e65c8615 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "move_readvalue_inputs_to_subgraph.hpp" + +#include + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/constant_folding.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/rotary_positional_embeddings.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" +#include "transformations/cpu_opset/common/op/sdpa.hpp" +#include "transformations/cpu_opset/common/op/submodel.hpp" +#include "transformations/rt_info/disable_fp16_compression.hpp" +#include "transformations/utils/gen_pattern.hpp" +#include "transformations/utils/utils.hpp" + +ov::intel_cpu::MoveReadValueInputsToSubgraph::MoveReadValueInputsToSubgraph() { + MATCHER_SCOPE(MoveReadValueInputsToSubgraph); + using namespace ov::pass::pattern; + + auto readvalue_pattern = pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto readvalue = as_type_ptr(pattern_map.at(readvalue_pattern).get_node_shared_ptr()); + if (!readvalue || readvalue->get_input_size() != 1u) { + return false; + } + + if 
(readvalue->get_rt_info().count("DisableInitSubgraphFusing") &&
+            readvalue->get_rt_info()["DisableInitSubgraphFusing"].as<bool>()) {
+            return false;
+        }
+
+        NodeVector subgraph_nodes;
+        std::unordered_set<std::shared_ptr<ov::Node>> visited_path_to_output;  // Cache nodes which connect to Output.
+        std::unordered_set<std::shared_ptr<ov::Node>> visited_path_to_rv;  // Cache nodes which connect to ReadValue.
+        NodeVector inputs = {};
+        OutputVector outputs = {};
+
+        // DFS: check whether the current node's final successors lead only to ReadValue.
+        std::function<void(std::shared_ptr<ov::Node>, bool&)> dfs = [&](std::shared_ptr<ov::Node> node,
+                                                                        bool& found_output) {
+            if (found_output) {
+                return;
+            }
+
+            if (visited_path_to_output.find(node) != visited_path_to_output.end()) {
+                found_output = true;
+                return;
+            }
+
+            if (visited_path_to_rv.find(node) != visited_path_to_rv.end()) {
+                return;
+            }
+
+            // node has no consumers, i.e. it is an Output
+            if (node->get_output_target_inputs(0).size() == 0u) {
+                found_output = true;
+                return;
+            }
+
+            bool any_child_on_output_path = false;
+            for (const auto& child : node->get_output_target_inputs(0)) {
+                auto son = child.get_node()->shared_from_this();
+                if (son == readvalue) {
+                    continue;
+                }
+
+                bool new_found_output = false;
+                dfs(son, new_found_output);
+                if (new_found_output) {
+                    any_child_on_output_path = true;
+                }
+            }
+
+            if (any_child_on_output_path) {
+                visited_path_to_output.insert(node);
+                found_output = any_child_on_output_path;
+            }
+        };
+
+        std::function<void(std::shared_ptr<ov::Node>)> reverse_dfs = [&](std::shared_ptr<ov::Node> node) {
+            if (visited_path_to_output.find(node) != visited_path_to_output.end()) {
+                inputs.emplace_back(node);
+                return;
+            }
+
+            if (visited_path_to_rv.find(node) != visited_path_to_rv.end()) {
+                return;
+            }
+
+            if (ov::op::util::is_parameter(node)) {
+                inputs.emplace_back(node);
+                return;
+            }
+
+            // Check if the current node has a path (bypassing the ReadValue node) to the Output node via the DFS
+            // algorithm.
+            bool found_output = false;  // Flag: an Output node was found
+            dfs(node, found_output);
+
+            if (found_output) {
+                inputs.emplace_back(node);
+                visited_path_to_output.insert(node);
+                return;
+            }
+
+            visited_path_to_rv.insert(node);
+
+            // Cache to subgraph_nodes
+            subgraph_nodes.emplace_back(node);
+
+            for (size_t i = 0; i < node->get_input_size(); i++) {
+                reverse_dfs(node->get_input_node_shared_ptr(i));
+            }
+        };
+
+        // Reverse DFS from ReadValue's input: find all suitable nodes and move them to subgraph_nodes.
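+        // Invariant: a node may be moved into the subgraph only if every path from it (bypassing ReadValue
+        // itself) terminates at this ReadValue; any traversed node that also reaches a model Output is turned
+        // into a subgraph input instead, so the main graph can still compute it.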
+        reverse_dfs(readvalue->get_input_node_shared_ptr(0));
+
+        if (inputs.size() == 0 || subgraph_nodes.size() == 0) {
+            return false;
+        }
+
+        // Subgraph's input
+        auto params = ParameterVector{};
+        for (auto inp : inputs) {
+            auto param =
+                std::make_shared<ov::op::v0::Parameter>(inp->get_element_type(), inp->get_output_partial_shape(0));
+            params.push_back(param);
+            for (const auto& child : inp->get_output_target_inputs(0)) {
+                auto it = std::find(subgraph_nodes.begin(), subgraph_nodes.end(), child.get_node()->shared_from_this());
+                if (it != subgraph_nodes.end()) {
+                    child.replace_source_output(param);
+                }
+            }
+        }
+
+        // Subgraph's output
+        auto last_node = readvalue->get_input_node_shared_ptr(0);
+        auto output = std::make_shared<ov::op::v0::Result>(last_node);
+        auto func = std::make_shared<ov::Model>(ov::ResultVector({output}), params, "state_init_submodel");
+
+        auto new_rv = std::make_shared<ov::intel_cpu::ReadValueWithSubgraph>(readvalue->get_variable(), func);
+
+        for (size_t i = 0; i < inputs.size(); i++) {
+            new_rv->set_input(inputs[i]->output(0), params[i]);
+        }
+        new_rv->set_output(output);
+
+        // Replace ReadValue with ov::intel_cpu::ReadValueWithSubgraph
+        ov::replace_node(readvalue, new_rv);
+        ov::copy_runtime_info(subgraph_nodes, new_rv);
+        new_rv->validate_and_infer_types();
+        return true;
+    };
+
+    auto m = std::make_shared<Matcher>(readvalue_pattern, matcher_name);
+    this->register_matcher(m, callback);
+}
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.hpp
new file mode 100644
index 00000000000000..220003cc83ead1
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_readvalue_inputs_to_subgraph.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/graph_rewrite.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @brief Move ReadValue's inputs inside the new CPU ngraph node: ReadValueWithSubgraph op.
+ *
+ *          input1
+ *             |
+ *   Some nodes (they have only one common successor [ReadValue])              input1
+ *             |                                                                  |
+ *        ReadValue                        ------->             ReadValueWithSubgraph (Subgraph is inside)
+ *        |       \                                                      |       \
+ *     Assign    others                                               Assign    others
+ */
+
+class MoveReadValueInputsToSubgraph : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MoveReadValueInputsToSubgraph", "0");
+    MoveReadValueInputsToSubgraph();
+};
+
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index 0ec2049d1ccc1c..447adb0b2fe23f 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -270,6 +270,13 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
         else
             assign_v_node->set_arguments({new_node->output(2)});
 
+        // Markup pattern:
+        // ReadValue->Convert(Optional)->ScaledDotProductAttentionWithKVCache->Convert(Optional)->Assign, so that
+        // ReadValue can't be replaced with ReadValueWithSubgraph in this pattern.
+        // TODO: Temporarily skip this pattern. If MemoryInputSDPA supports Subgraph in the future, it may be deleted.
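+        // The "DisableInitSubgraphFusing" rt_info entries set below are checked by MoveReadValueInputsToSubgraph,
+        // whose matcher callback returns early when the flag is present on a ReadValue node.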
+ past_k_node->get_rt_info()["DisableInitSubgraphFusing"] = true; + past_v_node->get_rt_info()["DisableInitSubgraphFusing"] = true; + return true; }; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 614f7d690f8726..5142ee319ac523 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -10,6 +10,7 @@ #include "common/pass/convert_to_swish_cpu.hpp" #include "common/pass/fc_bias_fusion.hpp" #include "common/pass/move_fc_reshape_to_weights.hpp" +#include "common/pass/move_readvalue_inputs_to_subgraph.hpp" #include "common/pass/rnn_sequences_optimization.hpp" #include "config.h" #include "itt.hpp" @@ -70,6 +71,7 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr& model, const C false); CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, ov::pass::EliminateConvert); // Need to clean up after the ConvertPrecision. + CPU_REGISTER_PASS_COMMON(manager, MoveReadValueInputsToSubgraph); manager.run_passes(model); } diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index a63377312ecb95..fb9e0925bc89e2 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -37,6 +37,7 @@ #include "transformations/common_optimizations/nop_elimination.hpp" #include "transformations/common_optimizations/reshape_prelu.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" +#include "transformations/common_optimizations/sdpa_fusion.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" #include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp" #include "transformations/common_optimizations/wrap_interpolate_into_transposes.hpp" @@ -695,6 +696,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_DISABLE_PASS_COMMON(manager, ov::pass::MatMulConstTransposesExtraction); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertScatterNDUpdate15ToScatterNDUpdate3); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertSliceScatter); + CPU_DISABLE_PASS_COMMON(manager, ov::pass::SDPAFusion); CPU_DISABLE_PASS_X64(manager, ov::pass::HSigmoidDecomposition); CPU_DISABLE_PASS_X64(manager, ov::pass::ReduceL1Decomposition); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp index 4989fb3a0f04b7..a3c1f9ef7d3544 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp @@ -16,11 +16,45 @@ using namespace CPUTestUtils; namespace ov { namespace test { +static std::string special_value_to_string(const ov::test::SpecialValue& value) { + if (value == SpecialValue::none) { + return "none"; + } else if (value == SpecialValue::nan) { + return "nan"; + } else if (value == SpecialValue::inf) { + return "inf"; + } else if (value == SpecialValue::overflow) { + return "overflow"; + } + return "unknown"; +} + +template +static T set_special_value(T& value, const ov::test::SpecialValue& 
special_value) { + if (special_value == ov::test::SpecialValue::nan) { + value = NAN; + } else if (special_value == ov::test::SpecialValue::inf) { + value = INFINITY; + } else if (special_value == ov::test::SpecialValue::overflow) { + value = value + std::numeric_limits::max(); + } + return value; +} + +template +static void modify_value(ov::Tensor& tensor, const ov::test::SpecialValue& special_value) { + T* dataPtr = static_cast(tensor.data()); + for (size_t i = 0; i < tensor.get_size(); i++) { + set_special_value(dataPtr[i], special_value); + } +} + std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo obj) { InputShape inputShape; ov::element::Type inPrc, outPrc; + ov::test::SpecialValue special_value; CPUSpecificParams cpuParams; - std::tie(inputShape, inPrc, outPrc, cpuParams) = obj.param; + std::tie(inputShape, inPrc, outPrc, special_value, cpuParams) = obj.param; std::ostringstream result; @@ -30,6 +64,7 @@ std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo(inPrc, shape)); @@ -101,6 +146,31 @@ void ConvertCPULayerTest::SetUp() { function = makeNgraphFunction(inPrc, params, conversion, "ConversionCPU"); } +void ConvertCPULayerTest::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& funcInputs = function->inputs(); + for (size_t i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::Tensor tensor = + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]); + if (special_value != ov::test::SpecialValue::none) { + if (inPrc == ov::element::f32) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f16) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::bf16) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f8e4m3) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f8e5m2) { + modify_value(tensor, special_value); + } + } + + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } +} + void ConvertCPULayerTest::validate_out_prc() const { if (outPrc == ov::element::boolean) FAIL() << "ConvertCPULayerTest supports only non boolean output prc"; diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp index a53f56f873151c..a4f4e0fc56c238 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp @@ -13,9 +13,12 @@ using namespace CPUTestUtils; namespace ov { namespace test { +enum SpecialValue { none, nan, inf, overflow }; + using convertLayerTestParamsSet = std::tuple; class ConvertCPULayerTest : public testing::WithParamInterface, @@ -25,9 +28,12 @@ class ConvertCPULayerTest : public testing::WithParamInterface& targetInputStaticShapes) override; virtual void validate_out_prc() const; ov::element::Type inPrc, outPrc; +private: + ov::test::SpecialValue special_value; }; class ConvertToBooleanCPULayerTest : public ConvertCPULayerTest { diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp index 11e0440b2e3618..e5d87f5cb2f3dd 100644 --- 
a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp @@ -16,6 +16,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_7D_Dynamic, ConvertCPULayerTe ::testing::ValuesIn(inShapes_7D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({}, {}, {}, {}))), ConvertCPULayerTest::getTestCaseName); @@ -24,6 +25,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_7D_Static, ConvertCPULayerTes ::testing::ValuesIn(inShapes_7D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({}, {}, {}, {}))), ConvertCPULayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp index 59ca1065bf78d9..8181304bf95e7d 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp @@ -31,6 +31,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4D_Dynamic, ConvertCPULayerTe ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -39,6 +40,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4bit_Dynamic, ConvertCPULayer ::testing::Combine(::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn({ov::element::u4, ov::element::i4}), ::testing::ValuesIn({ov::element::f32, ov::element::bf16, ov::element::u8, ov::element::i8}), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {"ref"}))), ConvertCPULayerTest::getTestCaseName); @@ -52,9 +54,69 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4D_Static, ConvertCPULayerTes ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_static_common)), ConvertCPULayerTest::getTestCaseName); +const std::vector float_precisions = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +const std::vector f8_precisions = { + ov::element::f8e4m3, + ov::element::f8e5m2, +}; + +const std::vector specialValue = { + ov::test::SpecialValue::none, + ov::test::SpecialValue::nan, + ov::test::SpecialValue::inf, + ov::test::SpecialValue::overflow, +}; + +std::vector memForm4D_fp8 = { + CPUSpecificParams({nchw}, {nchw}, {}, expectedPrimitiveType()), + CPUSpecificParams({nhwc}, {nhwc}, {}, expectedPrimitiveType()), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_fp8_Static, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_to_fp8_Static, ConvertCPULayerTest, + ::testing::Combine( + 
::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_fp8_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_to_fp8_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + } // namespace Conversion } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp index 9c34d6220d4b2d..ab1e06639c5a3e 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp @@ -23,6 +23,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_blocked_Dynamic, ConvertCPULa ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -44,6 +45,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_Blocked, ConvertCPULayerTest, ::testing::ValuesIn(inShapes_4D_blocked), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(filterCPUSpecificParams(memForm4D_static_blocked))), ConvertCPULayerTest::getTestCaseName); @@ -52,6 +54,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Static, ConvertToBoolean ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions_floating_point), ::testing::Values(ov::element::boolean), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertToBooleanCPULayerTest::getTestCaseName); @@ -60,6 +63,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Dynamic, ConvertToBoolea ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions_floating_point), ::testing::Values(ov::element::boolean), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertToBooleanCPULayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp new file mode 100644 index 00000000000000..9186b43d3d863e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp @@ -0,0 +1,314 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/ov_tensor_utils.hpp" +#include 
"utils/cpu_test_utils.hpp" + +using namespace ov::test; +using namespace CPUTestUtils; +using InitGraphStatefulModelTestParams = std::tuple, // input shapes + bool // ReadValue Assgin Direct pair or not + >; +class InitGraphStatefulModelBase : virtual public ov::test::SubgraphBaseTest, + public testing::WithParamInterface, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + std::ostringstream result; + + std::vector inputShapes; + bool directPair; + std::tie(inputShapes, directPair) = obj.param; + + result << "IS="; + for (const auto& shape : inputShapes) { + result << ov::test::utils::partialShape2str({shape.first}) << "_"; + } + result << "TS="; + for (const auto& shape : inputShapes) { + result << "("; + if (!shape.second.empty()) { + for (const auto& itr : shape.second) { + result << ov::test::utils::vec2str(itr); + } + } + result << ")"; + } + result << "_DirectAssign=" << ov::test::utils::bool2str(directPair); + result << ")"; + + return result.str(); + } + + std::vector calculate_refs() override { + for (const auto& param : functionRefs->get_parameters()) { + inferRequestRef.set_tensor(param->get_default_output(), inputs.at(matched_parameters[param])); + } + inferRequestRef.infer(); + + auto outputs = std::vector{}; + for (const auto& output : functionRefs->outputs()) { + outputs.push_back(inferRequestRef.get_tensor(output)); + } + + return outputs; + } + + std::vector get_plugin_outputs() override { + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + inferRequest.infer(); + auto outputs = std::vector{}; + for (const auto& output : function->outputs()) { + outputs.push_back(inferRequest.get_tensor(output)); + } + return outputs; + } + + void run() override { + prepare(); + + auto&& states = inferRequest.query_state(); + auto&& refStates = inferRequestRef.query_state(); + + for (size_t i = 0; i < targetStaticShapes.size(); i++) { + for (auto iters = 0; iters < 5; iters++) { + generate_inputs(targetStaticShapes[i]); + + if (iters & 0x1) { + states.front().reset(); + refStates.front().reset(); + } else { + // generate and set state tensors every even iteration + using ov::test::utils::InputGenerateData; + + auto stateShape = get_state_shape(i); + auto tensor = utils::create_and_fill_tensor(statePrc, + stateShape, + InputGenerateData{0, 1, 1, iters}); + states.front().set_state(tensor); + refStates.front().set_state(tensor); + } + + validate(); + } + } + } + +protected: + virtual void check_init_graph_node() = 0; + + virtual ov::Shape get_state_shape(size_t i) = 0; + + void prepare() { + compile_model(); + + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + check_init_graph_node(); + + // ref + functionRefs = function->clone(); + + matched_parameters.clear(); + const auto& ref_params = functionRefs->get_parameters(); + const auto& params = function->get_parameters(); + for (size_t in_idx = 0; in_idx < params.size(); ++in_idx) { + matched_parameters.insert({ref_params[in_idx], params[in_idx]}); + } + + auto compiledModelRef = core->compile_model(functionRefs, ov::test::utils::DEVICE_TEMPLATE); + inferRequestRef = compiledModelRef.create_infer_request(); + } + + std::vector inputShapes; + const ov::element::Type netPrc = ElementType::f32; + ov::InferRequest inferRequestRef; + ov::element::Type statePrc; +}; + +// ReadValue Assign direct pair +// +// input_1 input_2 +// | | +// Add_1 / +// \ / +// MatMul +// | +// input_0 ReadValue .......... +// \ / \ . 
+//      Add_0       Assign ........
+//        |
+//      Result
+
+class InitGraphStatefulModel : public InitGraphStatefulModelBase {
+public:
+    void SetUp() override {
+        targetDevice = utils::DEVICE_CPU;
+
+        bool directPair;
+        std::tie(inputShapes, directPair) = this->GetParam();
+
+        init_input_shapes(inputShapes);
+        ov::ParameterVector input_params;
+        for (auto&& shape : inputDynamicShapes) {
+            input_params.push_back(std::make_shared<ov::op::v0::Parameter>(netPrc, shape));
+        }
+
+        input_params[0]->set_friendly_name("input_0");
+        input_params[1]->set_friendly_name("input_1");
+        input_params[2]->set_friendly_name("input_2");
+
+        // init_graph
+        auto add_1 =
+            std::make_shared<ov::op::v1::Add>(input_params[1], ov::op::v0::Constant::create(netPrc, {1}, {1.0f}));
+        add_1->set_friendly_name("init_graph/add_1");
+        auto mm_0 = std::make_shared<ov::op::v0::MatMul>(add_1, input_params[2]);
+        mm_0->set_friendly_name("init_graph/mm_0");
+
+        const std::string variable_name("var_direct_pair");
+        statePrc = netPrc;
+        auto variable = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{{inputDynamicShapes[1][0], inputDynamicShapes[2][1]}, statePrc, variable_name});
+
+        auto read = std::make_shared<ov::op::v6::ReadValue>(mm_0, variable);
+        std::shared_ptr<ov::Node> add_0 = std::make_shared<ov::op::v1::Add>(input_params[0], read);
+        add_0->set_friendly_name("add_0");
+        auto assign = std::make_shared<ov::op::v6::Assign>(directPair ? read : add_0, variable);
+        auto res = std::make_shared<ov::op::v0::Result>(add_0);
+        function = std::make_shared<ov::Model>(ov::ResultVector({res}), ov::SinkVector({assign}), input_params);
+    }
+
+    void check_init_graph_node() override {
+        // Nodes with friendly names "init_graph/add_1" and "init_graph/mm_0" should be moved into the subgraph.
+        CheckNumberOfNodesWithType(compiledModel, "Add", 0);
+        CheckNumberOfNodesWithType(compiledModel, "MatMul", 0);
+    }
+
+    ov::Shape get_state_shape(size_t i) override {
+        return ov::Shape({inputShapes[1].second[i][0], inputShapes[2].second[i][1]});
+    }
+};
+
+TEST_P(InitGraphStatefulModel, CompareWithRefs) {
+    run();
+}
+
+// ReadValueWithSubgraph with differing input and output precisions.
+//
+//   input[fp32]
+//       |
+//   Convert[fp32->fp16]
+//       |
+//   ReadValue ..........
+//    /      \          .
+//  Add     Assign ......
+//   |
+// Result
+
+class InitGraphStatefulDiffPrimitiveModel : public InitGraphStatefulModelBase {
+public:
+    void SetUp() override {
+        targetDevice = utils::DEVICE_CPU;
+
+        configuration.insert({"SNIPPETS_MODE", "DISABLE"});
+
+        bool directPair;
+        std::tie(inputShapes, directPair) = this->GetParam();
+
+        init_input_shapes(inputShapes);
+        ov::ParameterVector input_params;
+        for (auto&& shape : inputDynamicShapes) {
+            input_params.push_back(std::make_shared<ov::op::v0::Parameter>(netPrc, shape));
+        }
+
+        input_params[0]->set_friendly_name("input");
+
+        // init_graph
+        auto convert = std::make_shared<ov::op::v0::Convert>(input_params[0], ov::element::f16);
+        convert->set_friendly_name("init_graph/convert");
+
+        const std::string variable_name("var_diff_precision");
+        statePrc = ov::element::f16;
+        auto variable = std::make_shared<ov::op::util::Variable>(
+            ov::op::util::VariableInfo{{inputDynamicShapes[0]}, statePrc, variable_name});
+
+        auto readvalue = std::make_shared<ov::op::v6::ReadValue>(convert, variable);
+
+        std::shared_ptr<ov::Node> add =
+            std::make_shared<ov::op::v1::Add>(readvalue, ov::op::v0::Constant::create(ov::element::f16, {1}, {1.0f}));
+
+        auto assign = std::make_shared<ov::op::v6::Assign>(directPair ?
readvalue : add, variable);
+
+        auto res = std::make_shared<ov::op::v0::Result>(add);
+
+        function = std::make_shared<ov::Model>(ov::ResultVector({res}), ov::SinkVector({assign}), input_params);
+    }
+
+    void check_init_graph_node() override {
+        CheckNumberOfNodesWithType(compiledModel, "Convert", 1);
+    }
+
+    ov::Shape get_state_shape(size_t i) override {
+        return inputShapes[0].second[i];
+    }
+};
+
+TEST_P(InitGraphStatefulDiffPrimitiveModel, CompareWithRefs) {
+    run();
+}
+
+namespace {
+const std::vector<std::vector<InputShape>> inputShapes = {
+    {
+        // Dynamic shape.
+        {{1, -1}, {{1, 2}, {1, 2}, {1, 1}}},
+        {{2, -1}, {{2, 3}, {2, 10}, {2, 1}}},
+        {{-1, 2}, {{3, 2}, {10, 2}, {1, 2}}},
+    },
+    {
+        // Static shape.
+        {{1, 1}, {{1, 1}}},
+        {{4, 2}, {{4, 2}}},
+        {{2, 10}, {{2, 10}}},
+    }
+};
+
+const std::vector<bool> readValueAssignDirectPair = {true, false};
+
+const auto testParams_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes),
+    ::testing::ValuesIn(readValueAssignDirectPair));
+
+INSTANTIATE_TEST_SUITE_P(smoke_StatefulInitGraph,
+                         InitGraphStatefulModel,
+                         testParams_smoke,
+                         InitGraphStatefulModel::getTestCaseName);
+
+
+const std::vector<std::vector<InputShape>> inputShapesDiffPrecision = {
+    {
+        // Dynamic shape.
+        {{1, -1}, {{1, 10}, {1, 1}}},
+    },
+    {
+        // Static shape.
+        {{1, 1}, {{1, 1}}},
+    }
+};
+
+const auto testParamsDiffPrecision_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapesDiffPrecision),
+    ::testing::ValuesIn(readValueAssignDirectPair));
+
+INSTANTIATE_TEST_SUITE_P(smoke_StatefulInitGraph,
+                         InitGraphStatefulDiffPrimitiveModel,
+                         testParamsDiffPrecision_smoke,
+                         InitGraphStatefulDiffPrimitiveModel::getTestCaseName);
+
+} // namespace
+
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/bf16_convert_saturation.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/bf16_convert_saturation.cpp
new file mode 100644
index 00000000000000..96c08eeffed15a
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/bf16_convert_saturation.cpp
@@ -0,0 +1,114 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "utils/fusing_test_utils.hpp"
+
+using namespace CPUTestUtils;
+namespace ov {
+namespace test {
+/*
+    This test aims to cover the Eltwise node's BF16 output precision conversion logic in "saturation" mode. In this
+    test, we have a Select node with a condition input of boolean type and then/else inputs of f32 type (as constant
+    nodes with bf16 overflow data). The Select node is followed by a convolution node to ensure that it is converted
+    to bf16 precision.
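+    In saturation mode, values outside the representable bf16 range are expected to clamp to the largest finite
+    bf16 value instead of converting to infinity; the constant then/else inputs exercise the doOutputSaturation
+    path that the eltwise executor enables when one of its inputs is a constant.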
+*/ +using selectParams = std::tuple; +class BF16ConvertSaturation : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CpuTestWithFusing { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + InputShape shapes; + ElementType precision; + std::tie(shapes, precision) = obj.param; + + std::ostringstream result; + result << "Condition_prc_" << ElementType::boolean << "_Then_Else_prc_" << precision << "_"; + result << "IS=(" << shapes.first << ")_TS=("; + for (const auto& item : shapes.second) { + result << ov::test::utils::vec2str(item) << "_"; + } + result << "PluginConf_inference_precision=bf16"; + + return result.str(); + } + +protected: + void SetUp() override { + abs_threshold = 0; + targetDevice = ov::test::utils::DEVICE_CPU; + InputShape shapes; + ElementType precision; + std::tie(shapes, precision) = this->GetParam(); + init_input_shapes({shapes}); + std::tie(inFmts, outFmts, priority, selectedType) = emptyCPUSpec; + selectedType = makeSelectedTypeStr(getPrimitiveType(), ov::element::i8); + ov::element::TypeVector types{ov::element::boolean, precision, precision}; + ov::ParameterVector parameters; + auto param = std::make_shared(ov::element::boolean, inputDynamicShapes[0]); + parameters.push_back(param); + + ov::test::utils::InputGenerateData in_data; + in_data.start_from = -3.40282e+38; + in_data.range = 1; + in_data.resolution = 1; + auto thenTensor = ov::test::utils::create_and_fill_tensor(precision, ov::Shape{1}, in_data); + + in_data.start_from = 3.40282e+38; + in_data.range = 10; + in_data.resolution = 2; + auto elseTensor = ov::test::utils::create_and_fill_tensor(precision, ov::Shape{2, 1, 32, 32}, in_data); + + auto select = std::make_shared(parameters[0], + std::make_shared(thenTensor), + std::make_shared(elseTensor), + ov::op::AutoBroadcastType::NUMPY); + + auto conv_filter_shape = ov::Shape{1, 1, 3, 3}; + auto conv_filter = ov::op::v0::Constant::create(ElementType::f32, conv_filter_shape, {1}); + auto strides = ov::Strides{1, 1}; + auto pads_begin = ov::CoordinateDiff{0, 0}; + auto pads_end = ov::CoordinateDiff{0, 0}; + auto dilations = ov::Strides{1, 1}; + auto conv = + std::make_shared(select, conv_filter, strides, pads_begin, pads_end, dilations); + + function = makeNgraphFunction(ElementType::f32, parameters, conv, "Eltwise"); + configuration.insert({ov::hint::inference_precision(ov::element::bf16)}); + } + + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& modelInputs = function->inputs(); + ov::test::utils::InputGenerateData in_data; + in_data.start_from = -1; + in_data.range = 3; + in_data.resolution = 2; + auto condTensor = ov::test::utils::create_and_fill_tensor(modelInputs[0].get_element_type(), + targetInputStaticShapes[0], + in_data); + + inputs.insert({modelInputs[0].get_node_shared_ptr(), condTensor}); + } +}; + +TEST_P(BF16ConvertSaturation, CompareWithRefs) { + run(); +} + +const std::vector inShapes = { + // Condition + {{-1, -1, -1, -1}, {{2, 1, 32, 32}}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BF16ConvertSaturationTest, + BF16ConvertSaturation, + ::testing::Combine(::testing::ValuesIn(inShapes), ::testing::Values(ElementType::f32)), + BF16ConvertSaturation::getTestCaseName); + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp index 9ff4d0b989fefa..903b8c083b1a1f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp @@ -32,6 +32,17 @@ const std::vector types = { ov::element::f64, }; +const std::vector floatTypes = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +const std::vector f8Types = { + ov::element::f8e4m3, + ov::element::f8e5m2, +}; + INSTANTIATE_TEST_SUITE_P(smoke_ConversionLayerTest, ConversionLayerTest, ::testing::Combine(::testing::ValuesIn(conversionOpTypes), @@ -49,4 +60,23 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConversionToBooleanLayerTest, ::testing::Values(ov::element::boolean), ::testing::Values(ov::test::utils::DEVICE_CPU)), ConversionLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConversionToF8LayerTest, + ConversionLayerTest, + ::testing::Combine(::testing::Values(conversionOpTypes[0]), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(shapes)), + ::testing::ValuesIn(floatTypes), + ::testing::ValuesIn(f8Types), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ConversionLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConversionFromF8LayerTest, + ConversionLayerTest, + ::testing::Combine(::testing::Values(conversionOpTypes[0]), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(shapes)), + ::testing::ValuesIn(f8Types), + ::testing::ValuesIn(floatTypes), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ConversionLayerTest::getTestCaseName); + } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 7af707df602bfc..4c34b3fd2506ac 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -173,6 +173,8 @@ std::vector disabledTestPatterns() { R"(.*smoke_TopK/TopKLayerTest.Inference.*_k=21_.*_sort=value_modelType=f16_trgDev=CPU.*)", // Issue: 121812 R"(.*ConvertCPULayerTest.*outFmts=(nhwc|nChw8c|nChw16c).*)", + // Issue: MFDNN-12917. The oneDNN emitter of conversion from fp32 to fp8 has rounding issue. + R"(.*ConvertCPULayerTest.*(\[1.1.1080.1920\]|\(2.17.5.4\))_.*_inputPRC=f32_targetPRC=f8e4m3_.*)", // Need to generate sequence exactly in the i64 data type. Enable in scope of i64 enabling. 
R"(.*RandomUniformLayerTestCPU.*OutPrc=i64.*)", // Issue: 123815 (Tests are sensintive to available thread count on testing machines) @@ -529,6 +531,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConvertCPULayerTest.*f16.*)"); } #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM) if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) { @@ -536,6 +539,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*Prc=f16.*)"); retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*HasShapeOf=1.*)"); + retVector.emplace_back(R"(.*ConvertCPULayerTest.*f16.*)"); } else { // Issue 117407 retVector.emplace_back( diff --git a/src/plugins/intel_cpu/tests/unit/transformations/readvalue_subgraph.cpp b/src/plugins/intel_cpu/tests/unit/transformations/readvalue_subgraph.cpp new file mode 100644 index 00000000000000..3656130b579edd --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/transformations/readvalue_subgraph.cpp @@ -0,0 +1,232 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/read_value.hpp" +#include "transformations/cpu_opset/common/op/read_value_with_subgraph.hpp" + +using namespace testing; +/**************************************************************** + * Pattern 1 (From whisper decoder): + * input input + * | | + * MatMul ReadValueWithSubgraph (MatMul) + * | -> | \ + * ReadValue Result Assign + * | \ + * Result Assign + ****************************************************************/ +static std::shared_ptr constructRVWithSubGraph( + std::shared_ptr input, + const ov::element::Type& type, + std::shared_ptr variable) { + auto mm_weights = std::make_shared(type, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + + auto func_input = + std::make_shared(input->get_element_type(), input->get_output_partial_shape(0)); + + auto matmul = std::make_shared(func_input, mm_weights, false, false); + + auto func_output = std::make_shared(matmul); + + auto func = std::make_shared(ov::NodeVector({func_output}), + ov::ParameterVector{func_input}, + "state_init_submodel"); + + auto readvalue = std::make_shared(variable, func); + readvalue->set_input(input->output(0), func_input); + readvalue->set_output(func_output); + readvalue->validate_and_infer_types(); + + return readvalue; +} + +TEST(TransformationTests, ReadValueWithSubgraph_1) { + std::shared_ptr model(nullptr), model_ref(nullptr); + { + const ov::PartialShape shape{1, 1, 2}; + const ov::element::Type type = ov::element::f32; + std::shared_ptr variable = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape{1, 1, 2}, type, "var_id"}); + + { + auto input = std::make_shared(type, shape); + + auto mm_weights = + std::make_shared(type, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + + auto matmul = std::make_shared(input, mm_weights, false, false); + + auto readvalue = std::make_shared(matmul, variable); + + auto assign = std::make_shared(readvalue, variable); + + auto result = std::make_shared(readvalue); + model = std::make_shared(ov::ResultVector{result}, + ov::SinkVector{assign}, + 
ov::ParameterVector{input}); + + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(model); + } + { + auto input = std::make_shared(type, shape); + + auto readvalue = constructRVWithSubGraph(input, type, variable); + + auto assign = std::make_shared(readvalue, variable); + + auto result = std::make_shared(readvalue); + + model_ref = std::make_shared(ov::ResultVector{result}, + ov::SinkVector{assign}, + ov::ParameterVector{input}); + } + auto res = compare_functions(model, model_ref, 0, 0, 0, 0, 0, 0); + ASSERT_TRUE(res.first) << res.second; + } +} + +/*************************************************************************************************** + * Pattern 2 (Complex pattern): + * input input + * | | + * Convert Convert + * / | \ / | \ + * / | \ / Add2 \ + * Add1 Add2 \ | | \ | + * | | \ | ---> | | Add3 + * \ | Add3 | | / \ + * \ | / \ ReadValueWithSubgraph Result2 Subgraph(Add1, Add4, Add5) + * \ Add4 \ / \ + * \ | \ Result1 Assign + * Add5 Result2 + * | + * ReadValue + * / \ + * Result1 Assign + * + ***************************************************************************************************/ + +static std::shared_ptr create_const_node(ov::Shape shape) { + return std::make_shared(ov::element::i32, shape, std::vector{1}); +} + +static std::shared_ptr constructRVWithSubGraph2( + ov::NodeVector inputs, + const ov::element::Type& type, + std::shared_ptr variable) { + ov::ParameterVector func_inputs; + for (auto input : inputs) { + auto func_input = + std::make_shared(input->get_element_type(), input->get_output_partial_shape(0)); + func_inputs.push_back(func_input); + } + + auto add1 = std::make_shared(func_inputs[0], create_const_node(ov::Shape{4})); + + auto add4 = std::make_shared(func_inputs[1], func_inputs[2]); + + auto add5 = std::make_shared(add1, add4); + + auto func_output = std::make_shared(add5); + + auto func = std::make_shared(ov::NodeVector({func_output}), func_inputs, "state_init_submodel"); + + auto readvalue = std::make_shared(variable, func); + for (size_t i = 0; i < inputs.size(); i++) { + readvalue->set_input(inputs[i]->output(0), func_inputs[i]); + } + readvalue->set_output(func_output); + readvalue->validate_and_infer_types(); + + return readvalue; +} + +TEST(TransformationTests, ReadValueWithSubgraph_2) { + std::shared_ptr model(nullptr), model_ref(nullptr); + { + const ov::PartialShape shape{1, 2, 4}; + const ov::element::Type in_type = ov::element::f32; + const ov::element::Type out_type = ov::element::i32; + + std::shared_ptr variable = + std::make_shared(ov::op::util::VariableInfo{shape, out_type, "var_id"}); + + { + auto input = std::make_shared(in_type, shape); + input->set_friendly_name("input"); + + auto convert = std::make_shared(input, out_type); + convert->set_friendly_name("convert"); + + auto add1 = std::make_shared(convert, create_const_node(ov::Shape{4})); + add1->set_friendly_name("add1"); + + auto add2 = std::make_shared(convert, create_const_node(ov::Shape{4})); + add2->set_friendly_name("add2"); + + auto add3 = std::make_shared(add2, convert); + add3->set_friendly_name("add3"); + + auto add4 = std::make_shared(add2, add3); + add4->set_friendly_name("add4"); + + auto add5 = std::make_shared(add1, add4); + add5->set_friendly_name("add5"); + + auto readvalue = std::make_shared(add5, variable); + readvalue->set_friendly_name("readvalue"); + + auto assign = std::make_shared(readvalue, variable); + assign->set_friendly_name("assign"); + + auto result1 = std::make_shared(readvalue); + result1->set_friendly_name("result1"); 
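+            // As the diagram above shows, the pass is expected to move the nodes that
+            // only feed the ReadValue initializer (Add1, Add4, Add5) into the body of
+            // a ReadValueWithSubgraph op, while nodes that also have other users
+            // (Convert, Add2, Add3) remain in the outer graph.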
+ + auto result2 = std::make_shared(add3); + result2->set_friendly_name("result2"); + + model = std::make_shared(ov::ResultVector{result1, result2}, + ov::SinkVector{assign}, + ov::ParameterVector{input}); + + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(model); + } + { + auto input = std::make_shared(in_type, shape); + + auto convert = std::make_shared(input, out_type); + + auto add2 = std::make_shared(convert, create_const_node(ov::Shape{4})); + + auto add3 = std::make_shared(add2, convert); + + auto readvalue = constructRVWithSubGraph2({convert, add2, add3}, out_type, variable); + + auto assign = std::make_shared(readvalue, variable); + + auto result1 = std::make_shared(readvalue); + + auto result2 = std::make_shared(add3); + + model_ref = std::make_shared(ov::ResultVector{result1, result2}, + ov::SinkVector{assign}, + ov::ParameterVector{input}); + } + auto res = compare_functions(model, model_ref, 0, 0, 0, 0, 0, 0); + ASSERT_TRUE(res.first) << res.second; + } +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index c7524f1880157d..0950614897ab43 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -267,6 +267,7 @@ REGISTER_FACTORY(v13, ScaledDotProductAttention); REGISTER_FACTORY(v13, BitwiseAnd); REGISTER_FACTORY(v13, BitwiseOr); REGISTER_FACTORY(v13, BitwiseXor); +REGISTER_FACTORY(v13, FakeConvert); // ------------------------------ Supported v15 ops ----------------------------- // REGISTER_FACTORY(v15, ROIAlignRotated); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fake_convert.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fake_convert.hpp new file mode 100644 index 00000000000000..c16af0be51abda --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fake_convert.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" +#include + +namespace cldnn { + +/// @brief FakeConvert performs element-wise quantization of input values +/// into a set of values corresponding to a target low-precision type. +struct fake_convert : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(fake_convert) + + fake_convert() : primitive_base("", {}) {} + + /// @brief Constructs fake_convert primitive. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param scale Scale primitive id. + /// @param shift Shift primitive id. + /// @param destination_type The low precision type to be emulated. + fake_convert(const primitive_id& id, + const input_info& input, + const input_info& scale, + const input_info& shift, + ov::element::Type destination_type = ov::element::Type_t::f8e4m3) + : primitive_base(id, {input, scale, shift}, 1), destination_type(destination_type) {} + + /// @brief Constructs fake_convert primitive. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param scale Scale primitive id. + /// @param shift Shift primitive id. + /// @param destination_type The low precision type to be emulated. 
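+    ///
+    /// A rough sketch of the emulation performed (assuming the v13 FakeConvert
+    /// semantics; the shift term is simply zero for this two-input overload):
+    ///   scaled   = input * scale;
+    ///   lowered  = convert(scaled, destination_type);   // saturating downconvert
+    ///   restored = convert(lowered, input_element_type);
+    ///   output   = restored / scale;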
+ fake_convert(const primitive_id& id, + const input_info& input, + const input_info& scale, + ov::element::Type destination_type = ov::element::Type_t::f8e4m3) + : primitive_base(id, {input, scale}, 1), destination_type(destination_type) {} + + ov::element::Type destination_type; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, destination_type.get_type_name()); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + auto rhs_casted = downcast(rhs); + return (destination_type == rhs_casted.destination_type); + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << make_data(&destination_type, sizeof(destination_type)); + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> make_data(&destination_type, sizeof(destination_type)); + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp index e84311a9cfb592..c83b1127e2d44c 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/implementation_desc.hpp @@ -19,6 +19,7 @@ enum class impl_types : uint8_t { ocl = 1 << 2, onednn = 1 << 3, sycl = 1 << 4, + cm = 1 << 5, any = 0xFF, }; @@ -43,6 +44,7 @@ inline std::ostream& operator<<(std::ostream& out, const impl_types& impl_type) case impl_types::common: out << "common"; break; case impl_types::ocl: out << "ocl"; break; case impl_types::onednn: out << "onednn"; break; + case impl_types::cm: out << "cm"; break; case impl_types::any: out << "any"; break; default: out << "unknown"; break; } @@ -61,6 +63,8 @@ inline std::istream& operator>>(std::istream& is, impl_types& impl_type) { impl_type = impl_types::ocl; } else if (str == "onednn") { impl_type = impl_types::onednn; + } else if (str == "cm") { + impl_type = impl_types::cm; } else if (str == "any") { impl_type = impl_types::any; } else { diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp index f87f608597a6bb..2638f2ad60cf26 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp @@ -24,6 +24,10 @@ struct paged_attention : public primitive_base { OPENVINO_ASSERT(inputs.size() == 13, "[GPU] Unexpected inputs number for PagedAttention primitive: ", inputs.size()); } + bool has_scores_output() const { + return num_outputs == 2; + } + bool operator==(const primitive& rhs) const override { return compare_common_params(rhs); } diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp index 09dfcf68f05725..9a26768d0fc068 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp @@ -16,6 +16,11 @@ struct work_group_sizes { std::vector local; }; +enum class kernel_language { + OCLC, + CM, +}; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Scalar //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -122,8 +127,10 @@ 
struct kernel_string { std::string entry_point; bool batch_compilation; bool has_microkernels; + kernel_language language; - kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {} + kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), + batch_compilation(false), has_microkernels(false), language(kernel_language::OCLC) {} std::string get_str() const { return str + jit + undefs + options + entry_point; } size_t get_hash() const { return std::hash()(get_str()); } diff --git a/src/plugins/intel_gpu/src/graph/fake_convert.cpp b/src/plugins/intel_gpu/src/graph/fake_convert.cpp new file mode 100644 index 00000000000000..b201378d52cc8d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/fake_convert.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fake_convert_inst.h" +#include "fake_convert_shape_inference.hpp" + +#include "primitive_type_base.h" +#include "intel_gpu/runtime/error_handler.hpp" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(fake_convert) + +layout fake_convert_inst::calc_output_layout(fake_convert_node const& node, kernel_impl_params const& impl_param) { + return calc_output_layouts(node, impl_param)[0]; +} + +template +std::vector fake_convert_inst::calc_output_layouts(fake_convert_node const& node, kernel_impl_params const& impl_param) { + const auto& input_layout = impl_param.get_input_layout(0); + auto output_type = ov::element::Type(input_layout.data_type); + + OPENVINO_ASSERT(ov::element::Type::merge(output_type, output_type, ov::element::Type(impl_param.get_input_layout(1).data_type)), + "Mixed input types are not supported."); + + if (impl_param.input_layouts.size() == 3) { + OPENVINO_ASSERT(ov::element::Type::merge(output_type, output_type, ov::element::Type(impl_param.get_input_layout(2).data_type)), + "Mixed input types are not supported."); + } + + switch (output_type) { + case ov::element::bf16: + case ov::element::f16: + case ov::element::f32: + break; + default: + OPENVINO_THROW("The output data type should be a bf16, f16, f32 but got: ", output_type); + } + + return { layout{input_layout.get_partial_shape(), output_type, input_layout.format} }; +} + +template std::vector fake_convert_inst::calc_output_layouts(fake_convert_node const& node, const kernel_impl_params& impl_param); + +std::string fake_convert_inst::to_string(fake_convert_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + auto& scale = node.scale(); + + std::stringstream primitive_description; + + json_composite fake_convert_info; + fake_convert_info.add("input id", input.id()); + fake_convert_info.add("scale id", scale.id()); + if (node.has_shift()) { + fake_convert_info.add("shift id", node.shift().id()); + } + fake_convert_info.add("destination_type", node.get_destination_type().get_type_name()); + + node_info->add("fake_convert info", fake_convert_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +fake_convert_inst::typed_primitive_inst(network& network, fake_convert_node const& node) + : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp index 9539117bcf4b18..a40c7dfebb9de6 100644 --- 
a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "broadcast_inst.h" #include "shape_of_inst.h" #include "read_value_inst.h" #include "reshape_inst.h" @@ -86,6 +87,13 @@ bool mark_shape_of_subgraphs::can_mark_node(const program_node& node) { return false; } + // skip mark_node for broadcast node if dependency nodes are data and shape_of + auto& dependencies = node.get_dependencies(); + if (node.is_type() && dependencies.size() == 2) { + if (dependencies[0].first->is_type() && dependencies[1].first->is_type()) + return false; + } + return true; } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 1e5f943600fc05..ac7810c6e9154c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -295,6 +295,12 @@ void remove_redundant_reorders::run(program& p) { auto o_layout = r_node.get_output_layout(); const auto& i_layout = r_node.get_input_layout(0); + auto is_r_node_rank_changed = r_node.get_output_layout().get_rank() != r_node.get_dependency(0).get_output_layout().get_rank(); + if (is_r_node_rank_changed && + ((!update_implementations && r_node.get_dependency(0).is_type()) || + (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized()))) + continue; + // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer, // but pads need to be handled correctly. if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() && diff --git a/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.cpp b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.cpp new file mode 100644 index 00000000000000..c4ec8da18c7136 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "impls/cm/impl_example.hpp" + +#include "fully_connected/cm/fully_connected_cm_kernel_selector.h" +#include "fully_connected/fully_connected_params.h" +#include "fully_connected_inst.h" +#include "impls/ocl/primitive_base.hpp" + +namespace cldnn { +namespace cm { + +struct example_impl : ocl::typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::fully_connected_cm_kernel_selector; + using kernel_params_t = kernel_selector::fully_connected_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cm::example_impl) + + example_impl() = default; + + std::unique_ptr clone() const override { + return make_deep_copy(*this); + } + +protected: + kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { + kernel_arguments_data args = parent::get_arguments(instance); + const auto& desc = instance.get_typed_desc(); + + args.weights = instance.weights_memory(); + args.bias = instance.bias_term() ? instance.bias_memory() : nullptr; + + args.inputs = {instance.input_memory_ptr(0)}; + size_t in_id = instance.bias_term() ? 
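+            // Dependency order for a compressed FC is {input, weights[, bias],
+            // scale[, zero-point]}, so the first decompression input sits at dep
+            // index 3 when a bias is present and at 2 otherwise: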
3 : 2; + if (!desc->decompression_scale.empty()) + args.inputs.push_back(instance.dep_memory_ptr(in_id++)); + + if (!desc->decompression_zero_point.empty()) + args.inputs.push_back(instance.dep_memory_ptr(in_id)); + + return args; + } + +public: + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + auto params = get_weights_bias_default_params(impl_param, + false, + is_shape_agnostic); + return params; + } +}; +std::unique_ptr ExampleImplementationManager::create_impl(const program_node& node, + const kernel_impl_params& params) const { + OPENVINO_ASSERT(node.is_type()); + return ocl::typed_primitive_impl_ocl::create( + static_cast(node), + params); +} +} // namespace cm +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cm::example_impl) diff --git a/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.hpp b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.hpp new file mode 100644 index 00000000000000..0208da12a2f378 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/cm/impl_example.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "fully_connected_inst.h" +#include "impls/registry/implementation_manager.hpp" + +namespace cldnn { +namespace cm { + +struct ExampleImplementationManager : public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("cm::example") + ExampleImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) + : ImplementationManager(impl_types::cm, shape_type, vf) {} + + std::unique_ptr create_impl(const program_node& node, + const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + + auto &engine = node.get_program().get_engine(); + auto &config = node.get_program().get_config(); + if (!check_cm_jit_support(engine, config)) { + return false; + } + + // Example impl should not be chosen unless forced + return false; + } +}; + +} // namespace cm +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp new file mode 100644 index 00000000000000..a5f94741c40bf5 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "impls/cpu/cpu_impl_helpers.hpp" +#include "register.hpp" +#include "fake_convert_inst.h" +#include "impls/registry/implementation_map.hpp" + +#include "openvino/op/fake_convert.hpp" + +namespace cldnn { +namespace cpu { + +struct fake_convert_impl : public typed_primitive_impl { + using parent = typed_primitive_impl; + using parent::parent; + + ov::element::Type destination_type; + + std::shared_ptr op; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::fake_convert_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + fake_convert_impl() : parent("fake_convert_cpu_impl") {} + + explicit fake_convert_impl(const fake_convert_node& outer) { + set_node_params(outer); + } + + void set_node_params(const program_node& arg) override { + OPENVINO_ASSERT(arg.is_type(), "[GPU] Incorrect program_node type"); + const auto& node = arg.as(); + destination_type = node.get_destination_type(); + } + + void save(BinaryOutputBuffer& ob) const override { + parent::save(ob); + ob << make_data(&destination_type, sizeof(destination_type)); + } + + void 
load(BinaryInputBuffer& ib) override { + parent::load(ib); + ib >> make_data(&destination_type, sizeof(destination_type)); + } + + event::ptr execute_impl(const std::vector& events, fake_convert_inst& instance) override { + OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "fake_convert::execute_impl"); + auto& stream = instance.get_network().get_stream(); + + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); + + if (!pass_through_events) { + stream.wait_for_events(events); + } + + auto params = instance.get_impl_params(); + + ov::TensorVector input_host_tensors; + ov::TensorVector output_host_tensors; + + if (!op) { + op = std::make_shared(); + op->set_destination_type(destination_type); + } + + std::vector input_mem_ptrs; + for (size_t i = 0; i < instance.dependencies().size(); i++) + input_mem_ptrs.push_back(instance.dep_memory_ptr(i)); + + auto output_mem_ptr = instance.output_memory_ptr(); + + cldnn::mem_lock output_lock(output_mem_ptr, stream); + + for (size_t i = 0; i < input_mem_ptrs.size(); i++) + input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); + + output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data())); + + OPENVINO_ASSERT(op->evaluate(output_host_tensors, input_host_tensors), + "[GPU] Couldn't execute fake_convert primitive with id ", instance.id()); + + if (pass_through_events) { + return stream.group_events(events); + } + + return make_output_event(stream, instance.is_output()); + } + + void init_kernels(const kernels_cache& , const kernel_impl_params&) override {} + + void update(primitive_inst& inst, const kernel_impl_params& impl_param) override {} + +public: + static std::unique_ptr create(const fake_convert_node& arg, const kernel_impl_params& impl_param) { + return make_unique(); + } +}; + + +namespace detail { + +attach_fake_convert_impl::attach_fake_convert_impl() { + auto formats = { + format::bfyx, + format::bfzyx, + format::bfwzyx, + format::bfuwzyx, + format::bfvuwzyx, + }; + + auto types = { + data_types::f32, + data_types::f16, + data_types::bf16 + }; + + implementation_map::add(impl_types::cpu, shape_types::static_shape, fake_convert_impl::create, types, formats); + implementation_map::add(impl_types::cpu, shape_types::dynamic_shape, fake_convert_impl::create, types, formats); +} + +} // namespace detail +} // namespace cpu +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::fake_convert_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::fake_convert) diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp index 2b0dc5b212158c..e86628444de439 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp @@ -31,6 +31,7 @@ void register_implementations() { REGISTER_CPU(tile); REGISTER_CPU(select); REGISTER_CPU(reduce); + REGISTER_CPU(fake_convert); } } // namespace cpu diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp index cb89eae29d8c56..15cc4b11c077eb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp @@ -56,6 +56,7 @@ REGISTER_CPU(broadcast); REGISTER_CPU(tile); REGISTER_CPU(select); REGISTER_CPU(reduce); +REGISTER_CPU(fake_convert); #undef REGISTER_CPU diff --git 
a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 42d83a0265d290..7d54129195ccc6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -121,6 +121,46 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) { namespace cldnn { +bool check_cm_jit_support(cldnn::engine& e, const cldnn::ExecutionConfig& config) { + auto device = e.get_device().get(); + + static std::mutex m; + std::lock_guard lock(m); + + static std::map cache; + if (cache.find(device) != cache.end()) { + return cache.at(device); + } + + std::shared_ptr kernel_string = std::make_shared(); + // This program checks if cm sources can be jitted by current IGC version + const char* kernel_code = R""""( + #include + #include + + extern "C" _GENX_MAIN_ void cm_check() { + unsigned int id = cm_linear_global_id(); + } + )""""; + + kernel_string->str = kernel_code; + kernel_string->options = " -cmc "; + kernel_string->entry_point = "cm_check"; + kernel_string->batch_compilation = true; + + try { + cldnn::kernel_impl_params dummy_params; + auto _kernels_cache_device_query = std::unique_ptr(new cldnn::kernels_cache(e, config, 0)); + _kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false); + _kernels_cache_device_query->build_all(); + cache[device] = true; + } catch (std::exception&) { + cache[device] = false; + } + + return cache.at(device); +} + bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) { auto device = e.get_device().get(); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index a8c715af98f198..bf8968fd4b255b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -306,6 +306,7 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params& } } +bool check_cm_jit_support(cldnn::engine& e, const cldnn::ExecutionConfig& config); bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config); } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index 5db452dcda26f0..b122195c8e1265 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -153,8 +153,12 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::string entry_point = kernel_string->entry_point; std::string options = kernel_string->options; bool batch_compilation = kernel_string->batch_compilation; + bool is_cm = kernel_string->language == kernel_language::CM; - if (batch_compilation) { + auto& headers = is_cm ? 
cm_batch_headers : batch_headers; + + // Order matters for cm options + if (batch_compilation && !is_cm) { options = reorder_options(options); } @@ -174,7 +178,7 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, const auto& batch_id = 0; // increase bucket id if and only if new bucket comes bucket_id = static_cast(program_buckets.size() - 1); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, headers, is_cm)); } // This is a temporary walk-around to avoid severe performance drop. @@ -205,7 +209,7 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, || current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end() || need_separate_batch(entry_point)) { const auto& batch_id = static_cast(current_bucket.size()); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, headers, is_cm)); } auto& current_batch = current_bucket.back(); @@ -270,12 +274,14 @@ kernels_cache::kernels_cache(engine& engine, const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor, - const std::map& batch_headers) + const std::map& batch_headers, + const std::map& cm_batch_headers) : _device(get_target_device(engine)) , _task_executor(task_executor) , _config(config) , _prog_id(prog_id) - , batch_headers(std::move(batch_headers)) { } + , batch_headers(std::move(batch_headers)) + , cm_batch_headers(std::move(cm_batch_headers)) { } static std::vector getProgramBinaries(cl::Program program) { // Get the size of the program binary in bytes. diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp index b08b087c55854a..1bb0ffbd2066bb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp @@ -58,7 +58,11 @@ class kernels_cache { bool has_microkernels; std::map> entry_point_to_id; - explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::map& batch_headers) + explicit batch_program(int32_t _bucket_id, + int32_t _batch_id, + std::string _options, + const std::map& batch_headers, + bool is_cm = false) : bucket_id(_bucket_id), batch_id(_batch_id), hash_value(0), @@ -68,17 +72,22 @@ class kernels_cache { dump_custom_program(false), has_microkernels(false), entry_point_to_id({}) { - static const std::vector micro_kernel_include_names { - "generic_vector_ops", - "tile_ops", - "sdpa_utils" - }; - for (const auto& kv : batch_headers) { - if (std::find(micro_kernel_include_names.begin(), micro_kernel_include_names.end(), kv.first) == micro_kernel_include_names.end()) { - source.push_back(kv.second); - } else { - micro_headers.push_back(kv.second); + if (!is_cm) { + static const std::vector micro_kernel_include_names { + "generic_vector_ops", + "tile_ops", + "sdpa_utils" + }; + for (const auto& kv : batch_headers) { + if (std::find(micro_kernel_include_names.begin(), micro_kernel_include_names.end(), kv.first) == micro_kernel_include_names.end()) { + source.push_back(kv.second); + } else { + micro_headers.push_back(kv.second); + } } + } else { + for (const auto& kv : batch_headers) + source.push_back(kv.second); } } }; @@ -97,6 +106,7 @@ class kernels_cache { std::map, uint32_t> _cached_binaries; 
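+    // CM (C-for-Metal) kernels are built against their own header map
+    // (cm_batch_headers below) rather than the OpenCL-C batch headers, and their
+    // option strings are never reordered, since option order is significant for CM.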
std::unordered_map _cached_kernels; std::map batch_headers; + std::map cm_batch_headers; std::unordered_map _kernel_batch_hash; void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; void build_batch(const batch_program& batch, compiled_kernels& compiled_kernels); @@ -112,7 +122,8 @@ class kernels_cache { const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor = nullptr, - const std::map& batch_headers = {}); + const std::map& batch_headers = {}, + const std::map& cm_batch_headers = {}); kernel::ptr get_kernel_from_cached_kernels(std::string id) const; std::vector get_kernels(const kernel_impl_params& params) const; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp index 9cf1a252564934..2bc377f2c1459a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp @@ -63,6 +63,7 @@ struct paged_attention_impl : multi_stage_primitive { void load(BinaryInputBuffer& ib) override { parent::load(ib); + ib >> make_data(&has_scores_output, sizeof(bool)); if (is_dynamic()) { auto& kv_cache_update_kernel_selector = kv_cache_update_kernel_selector_t::Instance(); auto kv_cache_update_kernel_impl = kv_cache_update_kernel_selector.GetImplementation(_kernels_data[Stage::KV_CACHE_UPDATE].kernelName); @@ -78,7 +79,45 @@ struct paged_attention_impl : multi_stage_primitive { } } + void save(BinaryOutputBuffer& ob) const override { + parent::save(ob); + ob << make_data(&has_scores_output, sizeof(bool)); + } + std::vector get_internal_buffer_layouts_impl() const override { + /* + * Internal buffers allocation owners and users: + * +--------------------------------------+--------------------+--------------------+ + * | Stage | Allocates & uses | Reuses | + * +--------------------------------------+--------------------+--------------------+ + * | KV_CACHE_UPDATE | [0, 1, 2] | | + * +--------------------------------------+--------------------+--------------------+ + * | SDPA (1st token) | | [0, 1, 2] | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (2nd+ token) | [5, 6, 7] | | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (mixed mode) | [5, 6, 7, 8] | | + * +--------------------------------------+--------------------+--------------------+ + * | SDPA (1st token) + scores output | | [0, 1, 2, 3, 4] | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (2nd+ token) + scores output | [3, 4, 5, 6, 7] | | + * +--------------------------------------+--------------------+--------------------+ + * | PA_SDPA (mixed mode) + scores output | [3, 4, 5, 6, 7, 8] | | + * +--------------------------------------+--------------------+--------------------+ + * + * Description: + * 0, 1, 2 - Buffers used for proper blocks distribution for kv_cache_update and + * sdpa_opt (1st token calculation) block configuration over target_seq_len dimension. + * Filled in paged_attention_inst::on_execute() call. + * 3, 4 - Optional buffers used for PA scores output calculation, storing intermediate + * softmax values by partitions (filled in PA/SDPA kernels) and sequence length offsets + * for each subsequence (filled in paged_attention_inst::on_execute() call). + * 5, 6, 7 - Used for 2nd+ PA calculation (for softmax exp_sums, max_logits, and intermediate output). 
+ * Filled in PA/SDPA kernels. + * 8 - Optional buffer used for mixed PA execution mode, mapping gws idx to subsequence id. + * Filled in paged_attention_inst::on_execute() call. + */ + auto add_internal_buffers = [](std::vector& layouts, const kernel_selector::KernelData& kd) { if (kd.internalBufferSizes.empty()) return; @@ -133,6 +172,7 @@ struct paged_attention_impl : multi_stage_primitive { args.outputs = { instance.output_memory_ptr(0) }; } else if (stage == Stage::PA_SDPA) { if (kernel_idx == 0 || kernel_idx == 1) { + // 2nd+ token calculation or mixed stage tokens calculation args.shape_info = instance.shape_info_memory_ptr(); args.inputs = { instance.input_memory_ptr(0), @@ -155,7 +195,8 @@ struct paged_attention_impl : multi_stage_primitive { if (desc->has_alibi) { args.inputs.push_back(instance.alibi_memory_ptr()); } - } else { + } else if (kernel_idx == 2 || kernel_idx == 3) { + // Finalization kernel or mixed stage finalization kernel args.inputs = { instance.past_lens_memory_ptr() }; if (is_mixed_mode) { @@ -163,17 +204,31 @@ struct paged_attention_impl : multi_stage_primitive { // dependency args.inputs.push_back(instance.subsequence_begins_memory_ptr()); } + } else if (kernel_idx == 4) { + // Output scores calculation kernel + args.inputs = { instance.past_lens_memory_ptr(), + instance.subsequence_begins_memory_ptr() }; } args.outputs = { instance.output_memory_ptr(0) }; + + if (kernel_idx == 4) { + args.outputs.push_back(instance.output_memory_ptr(1)); + } } return args; } std::set get_lockable_internal_buffers() const override { - return std::set{ 0, 1, 2, /* SDPA and KV_CACHE_UPDATE indexes configuration */ - 6, /* PA_SDPA multiple tokens mode */ }; + size_t mixed_mode_buffer = has_scores_output ? 8 : 6; + + std::set lockable_ids = { 0, 1, 2, /* SDPA and KV_CACHE_UPDATE indexes configuration */ + mixed_mode_buffer /* PA_SDPA multiple tokens mode */ }; + if (has_scores_output) + lockable_ids.insert(4 /* Precalculated accumulated sequence length offsets for each subsequence */); + + return lockable_ids; }; void execute_stage(const std::vector& events, @@ -194,8 +249,17 @@ struct paged_attention_impl : multi_stage_primitive { if (stage == Stage::PA_SDPA) { internal_buffers_offset = _kernels_data[Stage::KV_CACHE_UPDATE].internalBufferSizes.size(); internal_buffers_count = _kernels_data[Stage::PA_SDPA].internalBufferSizes.size(); - } else { + } else if (stage == Stage::KV_CACHE_UPDATE) { + internal_buffers_count = _kernels_data[Stage::KV_CACHE_UPDATE].internalBufferSizes.size(); + } else if (stage == Stage::SDPA) { internal_buffers_count = _kernels_data[Stage::KV_CACHE_UPDATE].internalBufferSizes.size(); + + const auto desc = instance.get_node().as().get_primitive(); + if (desc->has_scores_output()) { + // Add intermediate buffers for PagedAttention scores calculation: + // softmax_results, subsequence_offsets, exp_sums, max_logits, tmp_out + internal_buffers_count += 5; + } } for (size_t kd_idx = 0; kd_idx < _kernels_data[stage].kernels.size(); ++kd_idx) { @@ -216,6 +280,23 @@ struct paged_attention_impl : multi_stage_primitive { intermediate_memories.begin() + internal_buffers_offset, intermediate_memories.begin() + internal_buffers_offset + internal_buffers_count); + GPU_DEBUG_TRACE_DETAIL << "Execute stage=" << stage << " kernel=" << kd_idx << " " << _kernels_data[stage].kernelName << " start_offset=" + << internal_buffers_offset << " count=" << internal_buffers_count << "\n"; + + GPU_DEBUG_TRACE_DETAIL << "Configured kernel arguments:\n"; + for (size_t i = 0; i < 
_kernels_data[stage].kernels[kd_idx].params.arguments.size(); i++) { + GPU_DEBUG_TRACE_DETAIL << "\t" << i << ": type=" << static_cast(_kernels_data[stage].kernels[kd_idx].params.arguments[i].t) << " " + << "index=" << _kernels_data[stage].kernels[kd_idx].params.arguments[i].index << "\n"; + } + + GPU_DEBUG_TRACE_DETAIL << "Memory buffers:" + << "shape_info=" << args.shape_info << " " + << "inputs=" << args.inputs.size() << " " + << "outputs=" << args.outputs.size() << " " + << "intermediates=" << args.intermediates.size() << " " + << "weights=" << args.weights << " " + << "scalars=" << (args.scalars ? args.scalars->size() : 0) << "\n"; + stream.set_arguments(*_kernels[idx_final], _kernels_data[stage].kernels[kd_idx].params, args); const auto& gws = params.workGroups.global; @@ -242,10 +323,13 @@ struct paged_attention_impl : multi_stage_primitive { execute_stage(events, instance, res_events, Stage::KV_CACHE_UPDATE, is_mixed_mode); - std::vector dep_events(res_events.begin(), res_events.end()); if (stage == PagedAttentionStage::PREFILL) { + std::vector dep_events(res_events.begin(), res_events.end()); execute_stage(dep_events, instance, res_events, Stage::SDPA, is_mixed_mode); - } else if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED) { + } + + if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED || has_scores_output) { + std::vector dep_events(res_events.begin(), res_events.end()); execute_stage(dep_events, instance, res_events, Stage::PA_SDPA, is_mixed_mode); } @@ -338,7 +422,7 @@ struct paged_attention_impl : multi_stage_primitive { return aligned_seq_len; } - static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param) { + static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param, bool is_dynamic = true) { kernel_selector::sdpa_configuration config; const auto desc = impl_param.typed_desc(); @@ -362,37 +446,45 @@ struct paged_attention_impl : multi_stage_primitive { config.group_size = desc->heads_num / desc->kv_heads_num; } + if (desc->has_scores_output() && !is_dynamic) { + const auto& input_mem = impl_param.memory_deps; + const auto max_context_len = input_mem.at(12); + mem_lock max_context_len_mem_lock(max_context_len, *impl_param.strm); + config.paged_attention_max_len = max_context_len_mem_lock[0]; + } + return config; } static kv_cache_update_kernel_params_t get_kv_cache_update_kernel_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, + const kernel_selector::MultiDataTensor& input_tensors, bool is_dynamic = false) { auto params = get_default_params(impl_param, is_dynamic); - const auto& key_layout = impl_param.get_input_layout(1); - const auto& value_layout = impl_param.get_input_layout(2); - const auto& key_cache_layout = impl_param.get_input_layout(3); - const auto& value_cache_layout = impl_param.get_input_layout(4); - const auto& past_lens_layout = impl_param.get_input_layout(5); - const auto& block_indices_layout = impl_param.get_input_layout(7); - const auto& block_indices_begins_layout = impl_param.get_input_layout(8); - const auto& subsequence_begins_layout = impl_param.get_input_layout(6); + const auto& key_tensor = input_tensors[1]; + const auto& value_tensor = input_tensors[2]; + const auto& key_cache_tensor = input_tensors[3]; + const auto& value_cache_tensor = input_tensors[4]; + const auto& past_lens_tensor = input_tensors[5]; + const auto& block_indices_tensor = input_tensors[7]; + const 
auto& block_indices_begins_tensor = input_tensors[8]; + const auto& subsequence_begins_tensor = input_tensors[6]; const auto inputs_number = 6; const auto outputs_number = 2; params.inputs.resize(inputs_number); params.outputs.resize(outputs_number); - params.inputs[0] = convert_data_tensor(key_layout); - params.inputs[1] = convert_data_tensor(value_layout); - params.inputs[2] = convert_data_tensor(past_lens_layout); - params.inputs[3] = convert_data_tensor(block_indices_layout); - params.inputs[4] = convert_data_tensor(block_indices_begins_layout); - params.inputs[5] = convert_data_tensor(subsequence_begins_layout); - params.outputs[0] = convert_data_tensor(key_cache_layout); - params.outputs[1] = convert_data_tensor(value_cache_layout); + params.inputs[0] = key_tensor; + params.inputs[1] = value_tensor; + params.inputs[2] = past_lens_tensor; + params.inputs[3] = block_indices_tensor; + params.inputs[4] = block_indices_begins_tensor; + params.inputs[5] = subsequence_begins_tensor; + params.outputs[0] = key_cache_tensor; + params.outputs[1] = value_cache_tensor; - params.conf = get_sdpa_configuration(impl_param); + params.conf = get_sdpa_configuration(impl_param, is_dynamic); params.is_prefill = stage == PagedAttentionStage::PREFILL || stage == PagedAttentionStage::MIXED; @@ -418,18 +510,23 @@ struct paged_attention_impl : multi_stage_primitive { return params; } - static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, bool is_dynamic = false) { + static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, + const PagedAttentionStage& stage, + const kernel_selector::MultiDataTensor& input_tensors, + bool is_dynamic = false) { const auto desc = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_dynamic); - const auto& query_layout = impl_param.get_input_layout(0); - const auto& key_layout = impl_param.get_input_layout(1); - const auto& value_layout = impl_param.get_input_layout(2); - const auto& subsequence_begins_layout = impl_param.get_input_layout(6); - const auto& scale_layout = impl_param.get_input_layout(9); - const auto& alibi_layout = impl_param.get_input_layout(11); - const auto has_alibi = alibi_layout.count() > 0; + const auto& query_tensor = input_tensors[0]; + const auto& key_tensor = input_tensors[1]; + const auto& value_tensor = input_tensors[2]; + const auto& subsequence_begins_tensor = input_tensors[6]; + const auto& scale_tensor = input_tensors[9]; + const auto& alibi_tensor = input_tensors[11]; + + const auto has_alibi = impl_param.get_input_layout(11).count() > 0; const auto has_scale_input = !desc->scale_val.has_value(); + const auto has_scores_output = desc->has_scores_output(); auto inputs_number = 4; if (has_scale_input) @@ -440,18 +537,23 @@ struct paged_attention_impl : multi_stage_primitive { auto input_idx = 0; params.inputs.resize(inputs_number); - params.inputs[input_idx++] = convert_data_tensor(query_layout); - params.inputs[input_idx++] = convert_data_tensor(key_layout); - params.inputs[input_idx++] = convert_data_tensor(value_layout); - params.inputs[input_idx++] = convert_data_tensor(subsequence_begins_layout); + params.inputs[input_idx++] = query_tensor; + params.inputs[input_idx++] = key_tensor; + params.inputs[input_idx++] = value_tensor; + params.inputs[input_idx++] = subsequence_begins_tensor; if (has_scale_input) - params.inputs[input_idx++] = convert_data_tensor(scale_layout); + params.inputs[input_idx++] = scale_tensor; if 
(has_alibi) - params.inputs[input_idx++] = convert_data_tensor(alibi_layout); + params.inputs[input_idx++] = alibi_tensor; - params.conf = get_sdpa_configuration(impl_param); + if (has_scores_output) { + params.outputs.resize(2); + params.outputs[1] = convert_data_tensor(impl_param.get_output_layout(1)); + } + + params.conf = get_sdpa_configuration(impl_param, is_dynamic); const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; @@ -475,26 +577,34 @@ struct paged_attention_impl : multi_stage_primitive { if ((stage == PagedAttentionStage::PREFILL || stage == PagedAttentionStage::MIXED) && !is_dynamic) params.conf.paged_attention_aligned_seq_len = get_aligned_seq_len(impl_param, stage); + if (has_scores_output) + out_tensor_to_offset_map.insert({1, out_offsets_map.at(1)}); + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; } - static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, bool is_dynamic = false) { + static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, + const PagedAttentionStage& stage, + const kernel_selector::MultiDataTensor& input_tensors, + bool is_dynamic = false) { const auto desc = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_dynamic); - const auto& query_layout = impl_param.get_input_layout(0); - const auto& key_cache_layout = impl_param.get_input_layout(3); - const auto& value_cache_layout = impl_param.get_input_layout(4); - const auto& past_lens_layout = impl_param.get_input_layout(5); - const auto& block_indices_layout = impl_param.get_input_layout(7); - const auto& block_indices_begins_layout = impl_param.get_input_layout(8); - const auto& subsequence_begins_layout = impl_param.get_input_layout(6); - const auto& scale_layout = impl_param.get_input_layout(9); - const auto& alibi_layout = impl_param.get_input_layout(11); - const auto has_alibi = alibi_layout.count() > 0; + const auto& query_tensor = input_tensors[0]; + const auto& key_cache_tensor = input_tensors[3]; + const auto& value_cache_tensor = input_tensors[4]; + const auto& past_lens_tensor = input_tensors[5]; + const auto& block_indices_tensor = input_tensors[7]; + const auto& block_indices_begins_tensor = input_tensors[8]; + const auto& subsequence_begins_tensor = input_tensors[6]; + const auto& scale_tensor = input_tensors[9]; + const auto& alibi_tensor = input_tensors[11]; + + const auto has_alibi = impl_param.get_input_layout(11).count() > 0; const auto has_scale_input = !desc->scale_val.has_value(); + const auto has_scores_output = desc->has_scores_output(); auto inputs_number = 7; if (has_scale_input) @@ -505,28 +615,34 @@ struct paged_attention_impl : multi_stage_primitive { auto input_idx = 0; params.inputs.resize(inputs_number); - params.inputs[input_idx++] = convert_data_tensor(query_layout); - params.inputs[input_idx++] = convert_data_tensor(key_cache_layout); - params.inputs[input_idx++] = convert_data_tensor(value_cache_layout); - params.inputs[input_idx++] = convert_data_tensor(past_lens_layout); - params.inputs[input_idx++] = convert_data_tensor(block_indices_layout); - params.inputs[input_idx++] = convert_data_tensor(block_indices_begins_layout); - params.inputs[input_idx++] = convert_data_tensor(subsequence_begins_layout); - params.conf = get_sdpa_configuration(impl_param); + params.inputs[input_idx++] = query_tensor; + 
params.inputs[input_idx++] = key_cache_tensor; + params.inputs[input_idx++] = value_cache_tensor; + params.inputs[input_idx++] = past_lens_tensor; + params.inputs[input_idx++] = block_indices_tensor; + params.inputs[input_idx++] = block_indices_begins_tensor; + params.inputs[input_idx++] = subsequence_begins_tensor; + + params.conf = get_sdpa_configuration(impl_param, is_dynamic); if (has_scale_input) - params.inputs[input_idx++] = convert_data_tensor(scale_layout); + params.inputs[input_idx++] = scale_tensor; if (has_alibi) - params.inputs[input_idx++] = convert_data_tensor(alibi_layout); + params.inputs[input_idx++] = alibi_tensor; - params.multi_tokens_mode = stage == PagedAttentionStage::MIXED; + if (has_scores_output) { + params.outputs.resize(2); + params.outputs[1] = convert_data_tensor(impl_param.get_output_layout(1)); + } - if ((stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED) && !is_dynamic) { + params.stage = stage; + + if (!has_scores_output && !is_dynamic) { const auto& input_mem = impl_param.memory_deps; const auto max_context_len = input_mem.at(12); mem_lock max_context_len_mem_lock(max_context_len, *impl_param.strm); - params.max_context_len = max_context_len_mem_lock[0]; + params.conf.paged_attention_max_len = max_context_len_mem_lock[0]; } const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; @@ -552,6 +668,9 @@ struct paged_attention_impl : multi_stage_primitive { if (has_alibi) in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(11)}); + if (has_scores_output) + out_tensor_to_offset_map.insert({1, out_offsets_map.at(1)}); + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; @@ -560,14 +679,20 @@ struct paged_attention_impl : multi_stage_primitive { void update_dispatch_data(const kernel_impl_params& impl_param) override { const auto stage = get_paged_attention_stage(impl_param); - auto kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, impl_param.is_dynamic()); + kernel_selector::MultiDataTensor input_tensors; + for (const auto& input_layout : impl_param.input_layouts) + input_tensors.emplace_back(convert_data_tensor(input_layout)); + + auto kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); (_kernels_data[Stage::KV_CACHE_UPDATE].update_dispatch_data_func)(kv_cache_update_kernel_params, _kernels_data[Stage::KV_CACHE_UPDATE]); if (stage == PagedAttentionStage::PREFILL) { - auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, impl_param.is_dynamic()); + auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); (_kernels_data[Stage::SDPA].update_dispatch_data_func)(sdpa_kernel_params, _kernels_data[Stage::SDPA]); - } else if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED) { - auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, impl_param.is_dynamic()); + } + + if (stage == PagedAttentionStage::GENERATE || stage == PagedAttentionStage::MIXED || has_scores_output) { + auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); (_kernels_data[Stage::PA_SDPA].update_dispatch_data_func)(pa_sdpa_kernel_params, _kernels_data[Stage::PA_SDPA]); } } @@ -576,20 +701,32 @@ struct paged_attention_impl : multi_stage_primitive { std::vector kernels_data; const auto stage = PagedAttentionStage::UNKNOWN; - auto 
kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, impl_param.is_dynamic()); + kernel_selector::MultiDataTensor input_tensors; + for (const auto& input_layout : impl_param.input_layouts) + input_tensors.emplace_back(convert_data_tensor(input_layout)); + + auto kv_cache_update_kernel_params = get_kv_cache_update_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); auto& kv_cache_update_kernel_selector = kv_cache_update_kernel_selector_t::Instance(); kernels_data.push_back(kv_cache_update_kernel_selector.get_best_kernel(kv_cache_update_kernel_params)); - auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, impl_param.is_dynamic()); + auto sdpa_kernel_params = get_sdpa_kernel_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); auto& sdpa_kernel_selector = sdpa_kernel_selector_t::Instance(); kernels_data.push_back(sdpa_kernel_selector.get_best_kernel(sdpa_kernel_params)); - auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, impl_param.is_dynamic()); + auto pa_sdpa_kernel_params = get_pa_sdpa_params(impl_param, stage, input_tensors, impl_param.is_dynamic()); auto& pa_sdpa_kernel_selector = pa_sdpa_kernel_selector_t::Instance(); kernels_data.push_back(pa_sdpa_kernel_selector.get_best_kernel(pa_sdpa_kernel_params)); - return cldnn::make_unique(kernels_data); + auto impl = cldnn::make_unique(kernels_data); + + const auto& desc = impl_param.typed_desc(); + impl->has_scores_output = desc->has_scores_output(); + + return impl; } + +private: + bool has_scores_output = false; }; namespace detail { diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/fake_convert_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/fake_convert_impls.cpp new file mode 100644 index 00000000000000..991ab5aa12657a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/registry/fake_convert_impls.cpp @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "registry.hpp" +#include "intel_gpu/primitives/fake_convert.hpp" +#include "primitive_inst.h" + +namespace ov { +namespace intel_gpu { + +using namespace cldnn; + +const std::vector>& Registry::get_implementations() { + static const std::vector> impls = { + OV_GPU_GET_INSTANCE_CPU(fake_convert, shape_types::static_shape) + OV_GPU_GET_INSTANCE_CPU(fake_convert, shape_types::dynamic_shape) + }; + + return impls; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp index 6f725150794fb6..6ea9eb33c7421c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/fully_connected_impls.cpp @@ -11,6 +11,10 @@ #include "impls/onednn/fully_connected_onednn.hpp" #endif +#if OV_GPU_WITH_CM + #include "impls/cm/impl_example.hpp" +#endif + namespace ov { namespace intel_gpu { @@ -26,6 +30,7 @@ const std::vector>& Registry +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program& prog) + : parent(prim, prog), destination_type(prim->destination_type) { + support_padding_all(true); + } + +public: + using parent::parent; + + program_node& input() const { return get_dependency(0); } + program_node& scale() const { return get_dependency(1); 
} + program_node& shift() const { return get_dependency(2); } + bool has_shift() const { return (get_dependencies().size() == 3); } + + ov::element::Type get_destination_type() const { return destination_type; } + + std::vector get_shape_infer_dependencies() const override { return {}; } + +private: + ov::element::Type destination_type; +}; + +using fake_convert_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(fake_convert_node const& /*node*/, const kernel_impl_params& impl_param); + static layout calc_output_layout(fake_convert_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(fake_convert_node const& node); + + typed_primitive_inst(network& network, fake_convert_node const& node); +}; + +using fake_convert_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h index a7918ba9c3719c..675d77296aa06b 100644 --- a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h @@ -7,14 +7,11 @@ #include "intel_gpu/primitives/paged_attention.hpp" #include "primitive_inst.h" +#include "sdpa/pa_sdpa_kernel_opt.h" + namespace cldnn { -enum PagedAttentionStage { - GENERATE = 0, - PREFILL = 1, - MIXED = 2, - UNKNOWN = 3 -}; +using PagedAttentionStage = kernel_selector::PagedAttentionStage; PagedAttentionStage get_paged_attention_stage(const kernel_impl_params& impl_param); @@ -61,6 +58,9 @@ class typed_primitive_inst : public typed_primitive_inst_base

prefill_network; diff --git a/src/plugins/intel_gpu/src/graph/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/paged_attention.cpp index 787fd184f75b6a..c761aaf63799cd 100644 --- a/src/plugins/intel_gpu/src/graph/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/paged_attention.cpp @@ -48,14 +48,38 @@ layout paged_attention_inst::calc_output_layout(const paged_attention_node& /*no template std::vector paged_attention_inst::calc_output_layouts(paged_attention_node const& /*node*/, kernel_impl_params const& impl_param) { - auto out_layout = impl_param.get_input_layout(0); + auto data_layout = impl_param.get_input_layout(0); const auto& key_cache_ps = impl_param.get_input_layout(3).get_partial_shape(); bool valid_block_size = key_cache_ps[3].is_dynamic() || key_cache_ps[3].get_length() == paged_attention::block_size; OPENVINO_ASSERT(valid_block_size, "[GPU] Incorrect block size for Paged Attention operation. " "Expected ", paged_attention::block_size, ", but got ", key_cache_ps[3].get_length()); - return {out_layout}; + std::vector output_layouts{ data_layout }; + + const auto& desc = impl_param.typed_desc(); + if (desc->has_scores_output()) { + const auto past_lens_idx = 5; + const auto output_dt = data_layout.data_type; + if (impl_param.get_input_layout(past_lens_idx).is_static()) { + const auto& memory_deps = impl_param.memory_deps; + const auto past_lens_mem = memory_deps.at(past_lens_idx); + mem_lock past_lens_mem_lock(past_lens_mem, *impl_param.strm); + + long int total_size = 0; + for (size_t i = 0; i < past_lens_mem_lock.size(); i++) { + total_size += past_lens_mem_lock[i]; + } + + total_size += static_cast(impl_param.get_input_layout(0).get_shape()[0]); + + output_layouts.push_back(layout{ov::PartialShape{total_size}, output_dt, format::bfyx}); + } else { + output_layouts.push_back(layout{ov::PartialShape::dynamic(1), output_dt, format::bfyx}); + } + } + + return output_layouts; } template std::vector @@ -81,45 +105,79 @@ std::string paged_attention_inst::to_string(const paged_attention_node& node) { } void paged_attention_inst::on_execute() { - auto stage = get_paged_attention_stage(*_impl_params); + const auto& desc = _impl_params->typed_desc(); + const bool has_scores_output = desc->has_scores_output(); + const auto stage = get_paged_attention_stage(*_impl_params); - if (stage == PagedAttentionStage::UNKNOWN || - stage == PagedAttentionStage::GENERATE) + if ((stage == PagedAttentionStage::UNKNOWN) || + (stage == PagedAttentionStage::GENERATE && !has_scores_output)) return; + auto& stream = get_network().get_stream(); + const auto past_lens_mem = past_lens_memory_ptr(); + const auto subsequence_begins_mem = subsequence_begins_memory_ptr(); + mem_lock past_lens_mem_lock(past_lens_mem, stream); + mem_lock subsequence_begins_mem_lock(subsequence_begins_mem, stream); + std::unique_ptr> subsequence_offsets_lock = nullptr; + + if (has_scores_output) { + const size_t subsequence_offsets_idx = 4; + + OPENVINO_ASSERT(_intermediates_memory.size() > subsequence_offsets_idx, + "[GPU] Unexpected number of intermediates buffers for Paged Attention for scores output calculation"); + + auto subsequence_offsets_mem = _intermediates_memory[subsequence_offsets_idx]; + subsequence_offsets_lock.reset(new mem_lock(subsequence_offsets_mem, stream)); + } + + if (stage == PagedAttentionStage::GENERATE) { + // For the generate stage it's not necessary to configure any other intermediate + // buffers. 
Simply calculate the offsets and exit + size_t subsequence_offsets_acc = 0; + for (size_t i = 0; i < subsequence_begins_mem_lock.size() - 1; i++) { + const auto past_len = past_lens_mem_lock[i]; + const auto seq_start = subsequence_begins_mem_lock[i]; + const auto seq_end = subsequence_begins_mem_lock[i + 1]; + const auto seq_length = seq_end - seq_start; + + if (subsequence_offsets_lock) { + subsequence_offsets_lock->operator[](i) = static_cast(subsequence_offsets_acc); + subsequence_offsets_acc += seq_length + past_len; + } + } + + return; + } + OPENVINO_ASSERT(_intermediates_memory.size() >= 3, "Unexpected number of intermediates buffers for Paged Attention at prefill stage"); const auto blocks_indexes_start_idx = 0; const auto blocks_indexes_end_idx = 1; const auto blocked_gws_subseq_mapping_idx = 2; - const auto past_lens_mem = past_lens_memory_ptr(); - auto subsequence_begins_mem = subsequence_begins_memory_ptr(); auto blocks_indexes_start_mem = _intermediates_memory[blocks_indexes_start_idx]; auto blocks_indexes_end_mem = _intermediates_memory[blocks_indexes_end_idx]; auto blocked_gws_subseq_mapping_mem = _intermediates_memory[blocked_gws_subseq_mapping_idx]; OPENVINO_ASSERT(subsequence_begins_mem->get_layout().data_type == data_types::i32); - auto& stream = get_network().get_stream(); - mem_lock past_lens_mem_lock(past_lens_mem, stream); - mem_lock subsequence_begins_mem_lock(subsequence_begins_mem, stream); mem_lock blocks_indexes_start_lock(blocks_indexes_start_mem, stream); mem_lock blocks_indexes_end_lock(blocks_indexes_end_mem, stream); mem_lock blocked_gws_subseq_mapping_mem_lock(blocked_gws_subseq_mapping_mem, stream); std::unique_ptr> sequential_gws_subseq_mapping_lock = nullptr; if (stage == PagedAttentionStage::MIXED) { - const auto sequential_gws_subseq_mapping_idx = 6; + const size_t sequential_gws_subseq_mapping_idx = has_scores_output ? 
8 : 6; OPENVINO_ASSERT(_intermediates_memory.size() > sequential_gws_subseq_mapping_idx, - "Unexpected number of intermediates buffers for Paged Attention for mixed stage"); + "[GPU] Unexpected number of intermediates buffers for Paged Attention for mixed stage"); auto sequential_gws_subseq_mapping_mem = _intermediates_memory[sequential_gws_subseq_mapping_idx]; sequential_gws_subseq_mapping_lock.reset(new mem_lock(sequential_gws_subseq_mapping_mem, stream)); } size_t index = 0; + size_t subsequence_offsets_acc = 0; const auto target_seq_len_block_size = 16; // TODO: Get block size from the impl for (size_t i = 0; i < subsequence_begins_mem_lock.size() - 1; i++) { const auto past_len = past_lens_mem_lock[i]; @@ -159,6 +217,11 @@ void paged_attention_inst::on_execute() { sequential_gws_subseq_mapping_lock->operator[](idx) = static_cast(i); } } + + if (subsequence_offsets_lock) { + subsequence_offsets_lock->operator[](i) = static_cast(subsequence_offsets_acc); + subsequence_offsets_acc += seq_length + past_len; + } } } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index bdffb9c4980722..c938be22b816ed 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -228,7 +228,8 @@ void program::init_program() { if (_task_executor == nullptr) _task_executor = program::make_task_executor(_config); _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_headers())); + kernel_selector::KernelBase::get_db().get_batch_headers(), + kernel_selector::KernelBase::get_db().get_cm_batch_headers())); _kernels_cache->set_kernels_reuse(get_config().get_property(ov::intel_gpu::hint::enable_kernels_reuse)); @@ -1501,6 +1502,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::strided_slice::type_id() && prim.type() != cldnn::region_yolo::type_id() && prim.type() != cldnn::normalize::type_id() && + prim.type() != cldnn::group_normalization::type_id() && prim.type() != cldnn::mvn::type_id() && prim.type() != cldnn::gather::type_id() && prim.type() != cldnn::scatter_nd_update::type_id() && @@ -1581,6 +1583,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::deconvolution::type_id() && prim.type() != cldnn::multiclass_nms::type_id() && prim.type() != cldnn::normalize::type_id() && + prim.type() != cldnn::group_normalization::type_id() && prim.type() != cldnn::deconvolution::type_id() && prim.type() != cldnn::unique_count::type_id() && prim.type() != cldnn::unique_gather::type_id() && diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index 2b32423f9ce3a8..0c29c8afb9ff01 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -11,11 +11,16 @@ file(GLOB_RECURSE LIBRARY_SRC "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" ) +list(FILTER LIBRARY_SRC EXCLUDE REGEX "${CMAKE_CURRENT_SOURCE_DIR}/cm_kernels/.*" ) file(GLOB_RECURSE KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/cl_kernels/*.cl" ) +file(GLOB_RECURSE CM_KERNELS + "${CMAKE_CURRENT_SOURCE_DIR}/cm_kernels/*" +) + # Path which points to root directory where code generated elements are created # (specific to build configuration). 
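# (Editorial note, hedged) The CM sources globbed above are intentionally kept
# out of LIBRARY_SRC: like the OpenCL .cl kernels, they are not compiled by the
# host toolchain but embedded as strings into a generated primitive database and
# JIT-compiled at runtime. Under that assumption, a new kernel dropped into the
# tree is picked up roughly as follows (file names are illustrative):
#
#   cm_kernels/my_kernel.cpp           -> embedded into ${CM_PRIM_DB}
#   cm_kernels/include/batch_headers/* -> embedded into ${CM_PRIM_DB_BATCH_HEADERS}
#
# The codegen root referenced by the comment above is defined next: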
set(CODEGEN_DIR "${CMAKE_CURRENT_BINARY_DIR}/codegen") @@ -28,8 +33,12 @@ set(CODEGEN_INCDIR "${CODEGEN_DIR}/include") set(PRIM_DB "ks_primitive_db.inc") set(PRIM_DB_BATCH_HEADERS "ks_primitive_db_batch_headers.inc") +set(CM_PRIM_DB "ks_cm_primitive_db.inc") +set(CM_PRIM_DB_BATCH_HEADERS "ks_cm_primitive_db_batch_headers.inc") set(CODEGEN_CACHE_SOURCES "${CODEGEN_INCDIR}/${PRIM_DB}" - "${CODEGEN_INCDIR}/${PRIM_DB_BATCH_HEADERS}") + "${CODEGEN_INCDIR}/${PRIM_DB_BATCH_HEADERS}" + "${CODEGEN_INCDIR}/${CM_PRIM_DB}" + "${CODEGEN_INCDIR}/${CM_PRIM_DB_BATCH_HEADERS}") set(CODEGEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/primitive_db_gen.py") # Helping with some generators. @@ -52,6 +61,22 @@ add_custom_command(OUTPUT "${CODEGEN_INCDIR}/${PRIM_DB}" COMMENT "Updating file if the file changed (${PRIM_DB}) ..." ) +add_custom_command(OUTPUT "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB}" + COMMAND "${Python3_EXECUTABLE}" "${CODEGEN_SCRIPT}" -out_path "${CODEGEN_CACHE_DIR}" + -out_file_name_prim_db "${CM_PRIM_DB}" + -out_file_name_batch_headers "${CM_PRIM_DB_BATCH_HEADERS}" + -kernels "${CMAKE_CURRENT_SOURCE_DIR}/cm_kernels" -cm + DEPENDS ${CM_KERNELS} "${CODEGEN_SCRIPT}" "${CODEGEN_INCDIR}/${PRIM_DB}" + COMMENT "Generating ${CODEGEN_CACHE_DIR}/${CM_PRIM_DB} ..." +) + +add_custom_command(OUTPUT "${CODEGEN_INCDIR}/${CM_PRIM_DB}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB}" "${CODEGEN_INCDIR}/${CM_PRIM_DB}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB_BATCH_HEADERS}" "${CODEGEN_INCDIR}/${CM_PRIM_DB_BATCH_HEADERS}" + DEPENDS "${CODEGEN_CACHE_DIR}/${CM_PRIM_DB}" "${CM_KERNELS}" "${CODEGEN_SCRIPT}" "${CODEGEN_INCDIR}/${PRIM_DB}" + COMMENT "Updating file if the file changed (${CM_PRIM_DB}) ..." +) + add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${CODEGEN_CACHE_SOURCES}) if(NOT BUILD_SHARED_LIBS) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl index 542fa69ebc241b..109fa2de9841aa 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl @@ -122,8 +122,8 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( { #endif // SLM_DIV_FACTOR > 1 vec_t src = 0; -#if INPUT_LEFTOVERS - if ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM) + + if (INPUT_LEFTOVERS && ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)) { if (k * FEATURE_SLICE_SIZE + sglid < INPUT0_FEATURE_NUM) { @@ -143,7 +143,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( } } else -#endif // INPUT_LEFTOVERS { #if PADDED_INPUT #if X_BLOCK_SIZE > 1 diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 01c8e8853e350d..6a5c9e54a8e904 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -601,8 +601,10 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif #if TILE_OFM > 1 ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; #else acc[bi] += acc_tmp[bi] * ds; + acc_tmp[bi] = 0; #endif } } @@ -972,7 +974,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // 
===================================================================================================================================== // Main computation loop const uint iterations = MAIN_LOOP_ELEMENTS_COUNT / TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD) - // Each sub-group loads 2 Batch + // Each sub-group loads 2 Batch uint idx_sglid = (sglid * TILE_K) % TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction uint batch_sglid = (sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl index ca5c1ea3646d02..3f5796a30933ac 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ -147,9 +147,7 @@ inline void (FUNC_NAME)( // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, // but significantly degrades readability and generality of code. // It doesn't also show noticable performance improvement on tested configurations. - #if DECOMPRESSION_SCALE_POST_OP - ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; - #endif + ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) { #if COMPRESSED_WEIGHTS_INT4 @@ -201,11 +199,7 @@ inline void (FUNC_NAME)( unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { -#if DECOMPRESSION_SCALE_POST_OP ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; -#else - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; -#endif } } } @@ -240,9 +234,20 @@ inline void (FUNC_NAME)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; } } #endif + +#if !DECOMPRESSION_SCALE_POST_OP + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi]; + } + } +#endif + + } // ===================================================================================================================================== // Leftovers diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl index 00c43829d02ea7..7e960afa4b87d3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl @@ -44,6 +44,10 @@ KERNEL(pa_sdpa_opt)( const __global ALIBI_INPUT_TYPE* alibi_slopes, #endif __global OUTPUT_TYPE* output, +#if PAGED_ATTENTION_SCORES_OUTPUT + __global SOFTMAX_ACCUMULATOR_TYPE* softmax_results, + const __global int* subsequence_offsets, +#endif __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, __global OUTPUT_TYPE* tmp_out @@ -276,6 +280,28 @@ KERNEL(pa_sdpa_opt)( const uint max_logits_offset = exp_sums_offset; max_logits[max_logits_offset] = qk_max; } + +#if 
PAGED_ATTENTION_SCORES_OUTPUT +#if MULTI_TOKENS_PROCESSING + const uint subsequence_idx = gws_subseq_mapping[seq_idx]; + const uint subsequence_start_pos = subsequence_begins[subsequence_idx]; + const uint subsequence_end_pos = subsequence_begins[subsequence_idx + 1]; + const bool save_softmax_results = seq_idx == subsequence_end_pos - 1; +#else + const uint subsequence_idx = seq_idx; + const bool save_softmax_results = true; +#endif // MULTI_TOKENS_PROCESSING + // PagedAttention is supposed to save only last "row" of the QK matrix multiplication, + // so save SEQ_LEN_PARTITION_SIZE elements for each partition + if (save_softmax_results) { + const uint output_offset = subsequence_idx * HEADS_NUM * total_partitions_num * SEQ_LEN_PARTITION_SIZE + + head_num_idx * total_partitions_num * SEQ_LEN_PARTITION_SIZE + + partition_idx * SEQ_LEN_PARTITION_SIZE; + for (uint i = sgid * SUBGROUP_SIZE + sglid; i < SEQ_LEN_PARTITION_SIZE; i += SUBGROUPS_PER_WG * SUBGROUP_SIZE) { + softmax_results[output_offset + i] = slm_qk_vals[i]; + } + } +#endif // PAGED_ATTENTION_SCORES_OUTPUT } } @@ -370,6 +396,10 @@ KERNEL(pa_sdpa_finalization_stage)( const __global INPUT6_TYPE* subsequence_begins, #endif __global OUTPUT_TYPE* output, +#if PAGED_ATTENTION_SCORES_OUTPUT + __global SOFTMAX_ACCUMULATOR_TYPE* softmax_results, + const __global int* subsequence_offsets, +#endif const __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, const __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, const __global OUTPUT_TYPE* tmp_out, @@ -500,3 +530,155 @@ KERNEL(pa_sdpa_finalization_stage)( } #endif + +#ifdef SDPA_STAGE_2 +#define MAX_PARTITIONS_NUM 128 + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(pa_sdpa_scores_calculation)( + const __global INPUT3_TYPE* past_lens, + const __global INPUT6_TYPE* subsequence_begins, + __global OUTPUT1_TYPE* scores_output, + const __global SOFTMAX_ACCUMULATOR_TYPE* softmax_output, + const __global int* subsequence_offsets, + const __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + const __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + const __global OUTPUT_TYPE* tmp_out, + const uint is_mixed_mode) { + const uint subsequence_idx = get_global_id(2); + const uint partition_global_idx = get_global_id(0); + const uint local_id = get_local_id(0); + const uint partition_idx = get_group_id(0); + const uint partition_size = get_local_size(0); + const uint max_seq_len = get_global_size(0); + const uint partitions_num = get_num_groups(0); + const uint sgid = get_sub_group_id(); + const uint sgid_num = get_num_sub_groups(); + const uint sglid = get_sub_group_local_id(); + + const int subsequence_begin = subsequence_begins[subsequence_idx]; + const int subsequence_end = subsequence_begins[subsequence_idx + 1]; + const uint seq_len = (subsequence_end - subsequence_begin) + past_lens[subsequence_idx]; + + const uint num_of_partitions = CEIL_DIV(seq_len, partition_size); + + if (partition_idx >= num_of_partitions) + return; + + __local SOFTMAX_ACCUMULATOR_TYPE slm_exp_sums[HEADS_NUM]; + __local SOFTMAX_ACCUMULATOR_TYPE slm_global_exp_sum[HEADS_NUM]; + + SOFTMAX_ACCUMULATOR_TYPE total_score = SOFTMAX_ACCUMULATOR_VAL_ZERO; + if (seq_len <= partition_size) { + // If seq_len is less than the partition size, just reduce the results over the heads + for (uint head_idx = 0; head_idx < HEADS_NUM; head_idx++) { + const uint input_offset = subsequence_idx * HEADS_NUM * max_seq_len + head_idx * max_seq_len + partition_global_idx; + SOFTMAX_ACCUMULATOR_TYPE softmax_value = softmax_output[input_offset]; + total_score += softmax_value; + } 
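        // (Editorial note, hedged) In this single-partition branch the local
        // softmax already equals the global one, so the token's score is simply
        // the probability summed over heads:
        //
        //   score[t] = sum_h softmax_h[t]
        //
        // The longer-sequence branches below must first rescale each partition's
        // local results into a common scale, mirroring the two-pass softmax used
        // by the main SDPA kernels:
        //
        //   adj_exp_sum_p = exp_sum_p * exp(max_logit_p - global_max_logit)
        //   score[t]      = sum_h softmax_h[t] * adj_exp_sum_p / global_exp_sum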
+ } else if (seq_len <= partition_size * MAX_PARTITIONS_NUM) { + // Optimized version for longer prompts (up to partition_size * MAX_PARTITIONS_NUM, ~64K tokens) + + // Depending on the previous kernel exp_sums and max_logits might have different structure: + // For ordinary 1st and 2nd token kernels, there is only a single entry per subsequence. + // However, for mixed mode execution, exp_sums and max_logits include information for all + // tokens of each subsequence, but only the last one is needed for score calculation. + const uint subsequence_pos = is_mixed_mode ? subsequence_end - 1 : subsequence_idx; + + for (uint head_idx = sgid; head_idx < HEADS_NUM; head_idx += sgid_num) { + SOFTMAX_ACCUMULATOR_TYPE max_logit[MAX_PARTITIONS_NUM / SUBGROUP_SIZE]; + SOFTMAX_ACCUMULATOR_TYPE exp_sum[MAX_PARTITIONS_NUM / SUBGROUP_SIZE]; + + const uint exp_sums_offset = subsequence_pos * HEADS_NUM * partitions_num + head_idx * partitions_num; + for (int i = 0; i < partitions_num / SUBGROUP_SIZE; i++) { + max_logit[i] = max_logits[exp_sums_offset + i * SUBGROUP_SIZE + sglid]; + exp_sum[i] = exp_sums[exp_sums_offset + i * SUBGROUP_SIZE + sglid]; + } + + const uint partitions_leftovers = partitions_num % SUBGROUP_SIZE; + if (partitions_leftovers != 0) { + const uint idx = partitions_num / SUBGROUP_SIZE; + max_logit[idx] = sglid >= partitions_leftovers ? SOFTMAX_ACCUMULATOR_VAL_MIN : max_logits[exp_sums_offset + idx * SUBGROUP_SIZE + sglid]; + exp_sum[idx] = sglid >= partitions_leftovers ? SOFTMAX_ACCUMULATOR_VAL_ZERO : exp_sums[exp_sums_offset + idx * SUBGROUP_SIZE + sglid]; + } + + SOFTMAX_ACCUMULATOR_TYPE global_max_logit = max_logit[0]; + for (uint i = 1; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + global_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(global_max_logit, max_logit[i]); + } + + global_max_logit = sub_group_reduce_max(global_max_logit); + + SOFTMAX_ACCUMULATOR_TYPE global_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + for (uint i = 0; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + SOFTMAX_ACCUMULATOR_TYPE adjusted_exp_sum = exp_sum[i] * native_exp(max_logit[i] - global_max_logit); + // slm_exp_sums[head_idx][i * SUBGROUP_SIZE + sglid] = adjusted_exp_sum; + if (i * SUBGROUP_SIZE + sglid == partition_idx) + slm_exp_sums[head_idx] = adjusted_exp_sum; + global_exp_sum += adjusted_exp_sum; + } + + global_exp_sum = sub_group_reduce_add(global_exp_sum); + + slm_global_exp_sum[head_idx] = global_exp_sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint head_idx = 0; head_idx < HEADS_NUM; head_idx++) { + SOFTMAX_ACCUMULATOR_TYPE adjusted_exp_sum = slm_exp_sums[head_idx]; + SOFTMAX_ACCUMULATOR_TYPE global_exp_sum = slm_global_exp_sum[head_idx]; + + const uint input_offset = subsequence_idx * HEADS_NUM * max_seq_len + head_idx * max_seq_len + partition_global_idx; + SOFTMAX_ACCUMULATOR_TYPE softmax_value = softmax_output[input_offset]; + + softmax_value = softmax_value * adjusted_exp_sum / global_exp_sum; + total_score += softmax_value; + } + } else { + // Non optimized fallback version + const uint subsequence_pos = is_mixed_mode ? 
subsequence_end - 1 : subsequence_idx; + for (uint head_idx = 0; head_idx < HEADS_NUM; head_idx++) { + SOFTMAX_ACCUMULATOR_TYPE global_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + const uint max_logits_base_offset = subsequence_pos * HEADS_NUM * partitions_num + head_idx * partitions_num; + for (uint i = 0; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + const uint partition_offset = i * SUBGROUP_SIZE + sglid; + SOFTMAX_ACCUMULATOR_TYPE max_logit = partition_offset >= partitions_num ? SOFTMAX_ACCUMULATOR_VAL_MIN : max_logits[max_logits_base_offset + partition_offset]; + global_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(global_max_logit, max_logit); + } + + global_max_logit = sub_group_reduce_max(global_max_logit); + + SOFTMAX_ACCUMULATOR_TYPE global_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE partition_adjusted_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + const uint exp_sums_base_offset = subsequence_pos * HEADS_NUM * partitions_num + head_idx * partitions_num; + for (uint i = 0; i < CEIL_DIV(partitions_num, SUBGROUP_SIZE); i++) { + const uint partition_offset = i * SUBGROUP_SIZE + sglid; + SOFTMAX_ACCUMULATOR_TYPE exp_sum = partition_offset >= partitions_num ? SOFTMAX_ACCUMULATOR_VAL_ZERO : exp_sums[exp_sums_base_offset + partition_offset]; + SOFTMAX_ACCUMULATOR_TYPE max_logit = partition_offset >= partitions_num ? SOFTMAX_ACCUMULATOR_VAL_MIN : max_logits[max_logits_base_offset + partition_offset]; + SOFTMAX_ACCUMULATOR_TYPE adjusted_exp_sum = exp_sum * native_exp(max_logit - global_max_logit); + global_exp_sum += adjusted_exp_sum; + + // Save and broadcast the adjusted exp_sum for the currently being processed partition + if (i == partition_idx / SUBGROUP_SIZE) + partition_adjusted_exp_sum = sub_group_broadcast(adjusted_exp_sum, partition_idx % SUBGROUP_SIZE); + } + + global_exp_sum = sub_group_reduce_add(global_exp_sum); + + const uint input_offset = subsequence_idx * HEADS_NUM * max_seq_len + head_idx * max_seq_len + partition_global_idx; + SOFTMAX_ACCUMULATOR_TYPE softmax_value = softmax_output[input_offset]; + + softmax_value = softmax_value * partition_adjusted_exp_sum / global_exp_sum; + total_score += softmax_value; + } + } + + const uint output_offset = subsequence_offsets[subsequence_idx]; + if (partition_global_idx < seq_len) { + scores_output[output_offset + partition_global_idx] = total_score; + } +} + +#undef MAX_PARTITIONS_NUM +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl index 95f0d0ff399a3b..ee27d220e30ce9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl @@ -66,10 +66,7 @@ KERNEL (reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx)( #if (TILE_SIZE == DEFAULT_TILE_SIZE) - // read - INPUTVTYPE read_data = AS_INPUTVTYPE(_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); - - // write + // write index const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER); if (F_NO_REMAINDER_CONDITION @@ -79,13 +76,25 @@ KERNEL (reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx)( ) { #ifdef X_REMAINDER_SIZE if (X_REMAINDER_CONDITION) { + // read + INPUTVTYPE read_data; + for (int j = 0; j < X_REMAINDER_SIZE; ++j) { + read_data[j] = AS_INPUT0_TYPE(_sub_group_block_read((const __global uint*)(input) + input_idx_tile + j * 
DEFAULT_STRIDE)); + } + // write for (int i = 0 ; i < X_REMAINDER_SIZE; i++) { output[output_idx + i] = TO_OUTPUT_TYPE(read_data[i]); } } else { + // read + INPUTVTYPE read_data = AS_INPUTVTYPE(_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); + // write VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); } #else + // read + INPUTVTYPE read_data = AS_INPUTVTYPE(_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); + // write VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl index 45d0ccc5c0933e..2f403b798dea39 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_bfyx_to_blocked_format.cl @@ -26,6 +26,18 @@ } \ } +#define FUNC_LOAD_LEFTOVERS(inner, outer) unroll_for (uint lh = 0; lh < outer; ++lh) { \ + const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER); \ + INPUTVTYPE read_data; \ + unroll_for (uint lw = 0; lw < inner; ++lw) { \ + read_data[lw] = input[input_idx + lw]; \ + } \ + unroll_for (uint lw = 0; lw < inner; ++lw) { \ + const uint dst = local_buf_offset + lw; \ + transpose_buf[dst][lh] = read_data[lw]; \ + } \ + } + #define FUNC_VSTORE(loop) unroll_for (uint lw = 0; lw < loop; ++lw) { \ const uint output_idx = output_idx_tile + (lw * x_pitch); \ VSTORE(TO_OUTPUTVTYPE(transpose_buf[local_buf_offset + lw]), 0, output + output_idx); \ @@ -109,7 +121,15 @@ KERNEL (reorder_data_bfyx_to_blocked_format)( if (F_NO_REMAINDER_CONDITION) { // read and transpose +#ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_VLOAD(TILE_SIZE, TILE_SIZE) + } else { + FUNC_LOAD_LEFTOVERS(X_REMAINDER_SIZE, TILE_SIZE) + } +#else FUNC_VLOAD(TILE_SIZE, TILE_SIZE) +#endif // write to ddr #ifdef X_REMAINDER_CONDITION @@ -125,7 +145,15 @@ KERNEL (reorder_data_bfyx_to_blocked_format)( #ifdef F_REMAINDER_CONDITION else if (F_REMAINDER_CONDITION) { // read and transpose + #ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_VLOAD(TILE_SIZE, F_REMAINDER_SIZE) + } else { + FUNC_LOAD_LEFTOVERS(X_REMAINDER_SIZE, F_REMAINDER_SIZE) + } + #else FUNC_VLOAD(TILE_SIZE, F_REMAINDER_SIZE) + #endif // write to ddr #ifdef X_REMAINDER_CONDITION diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 55f87e4189d9fe..cddafe62623d9e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -842,6 +842,14 @@ KERNEL(sdpa_opt)( const __global int* blocked_indexes_start, const __global int* blocked_indexes_end, const __global int* gws_seq_indexes_correspondence +#if PAGED_ATTENTION_SCORES_OUTPUT + , __global SOFTMAX_ACCUMULATOR_TYPE* softmax_results + , const __global int* subsequence_offsets + , __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums + , __global SOFTMAX_ACCUMULATOR_TYPE* max_logits + , __global OUTPUT_TYPE* tmp_out + , const uint aligned_max_context_len +#endif #else __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, @@ -1222,6 +1230,39 @@ KERNEL(sdpa_opt)( slm_qk_vals[sglid * SEQ_LEN_PARTITION_SIZE + sgid * TARGET_SEQ_LEN_BLOCK_SIZE + i] = qk_acc[i]; } +#if 
PAGED_ATTENTION_SCORES_OUTPUT + const uint subsequence_idx = gws_seq_indexes_correspondence[target_seq_dim]; + const uint subsequence_end_pos = subsequence_begins[subsequence_idx + 1]; + const uint block_start_pos = blocked_indexes_start[target_seq_dim]; + const uint block_end_pos = blocked_indexes_end[target_seq_dim]; + + // PagedAttention is supposed to save only last "row" of the QK matrix multiplication, + // so save SEQ_LEN_PARTITION_SIZE elements for each partition + if (subsequence_end_pos == block_end_pos) { + const uint last_row_idx = block_end_pos - block_start_pos - 1; + if (sglid == last_row_idx) { + const uint partition_idx = start_partition_idx / SEQ_LEN_PARTITION_SIZE; + + if (sgid == 0) { + const uint max_partitions_num = aligned_max_context_len / SEQ_LEN_PARTITION_SIZE; + const uint exp_sums_output_offset = subsequence_idx * NUM_HEADS * max_partitions_num + + num_heads_dim * max_partitions_num + + partition_idx; + exp_sums[exp_sums_output_offset] = exp_sum_new; + max_logits[exp_sums_output_offset] = qk_max_new; + } + + const uint output_offset = subsequence_idx * NUM_HEADS * aligned_max_context_len + + num_heads_dim * aligned_max_context_len + + partition_idx * SEQ_LEN_PARTITION_SIZE + sgid * TARGET_SEQ_LEN_BLOCK_SIZE; + for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + softmax_results[output_offset + i] = qk_acc[i]; + } + + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/example.cpp b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/example.cpp new file mode 100644 index 00000000000000..abee70f6483d17 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/example.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +namespace KERNEL_NAME { + +#include "include/example_header.h" + +extern "C" _GENX_MAIN_ void KERNEL_NAME(svmptr_t x [[type("svmptr_t")]]) { + // This kernel prints and exits + if (cm_linear_global_id() == 0) { + printf("Example CM kernel\n"); + printf("Pointer address: %p\n", (void*)x); + + // Call function from header + print_lws_gws(); + + // Check macro from batch header +#ifdef EXAMPLE_CM_MACRO + printf("Batch header included\n"); +#else + printf("Batch header not included\n"); +#endif + } +} +} // namespace KERNEL_NAME diff --git a/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/batch_headers/exmaple_batch_header.h b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/batch_headers/exmaple_batch_header.h new file mode 100644 index 00000000000000..f3f2aa183e88dc --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/batch_headers/exmaple_batch_header.h @@ -0,0 +1,5 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#define EXAMPLE_CM_MACRO diff --git a/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/example_header.h b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/example_header.h new file mode 100644 index 00000000000000..3ce3a33188d0fc --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cm_kernels/include/example_header.h @@ -0,0 +1,8 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +inline void print_lws_gws() { + printf("lws: %d, %d, %d\n", cm_local_size(0), cm_local_size(1), cm_local_size(2)); + printf("gws: %d, %d, %d\n", cm_group_count(0), cm_group_count(1), cm_group_count(2)); +} diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_cm.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_cm.h new file mode 100644 index 00000000000000..32744f65bee7e0 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_cm.h @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base.h" + +namespace kernel_selector { + +class KernelBaseCM : public KernelBase { +public: + using KernelBase::KernelBase; + virtual ~KernelBaseCM() {} + +protected: + virtual bool Validate(const Params&) const { + return true; + } + std::shared_ptr GetKernelString(const std::string& kernel_name, + const std::pair& jit, + const std::string& entry_point) const { + std::shared_ptr kernel_string = std::make_shared(); + + bool is_cm = true; + auto codes = db.get(kernel_name, is_cm); + + if (codes.size()) { + kernel_string->str = codes[0]; + kernel_string->jit = "#include \n#include \n"; + kernel_string->jit += jit.first; + kernel_string->undefs = jit.second; + kernel_string->options = " -cmc "; + + kernel_string->entry_point = entry_point; + kernel_string->batch_compilation = true; + kernel_string->language = KernelLanguage::CM; + } + + return kernel_string; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index d9b132ac1dcc43..b55740110b2f28 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -46,6 +46,7 @@ namespace kernel_selector { std::string GetStringEnv(const char* varName); +using KernelLanguage = cldnn::kernel_language; using KernelString = cldnn::kernel_string; using WorkGroupSizes = cldnn::work_group_sizes; using ScalarDescriptor = cldnn::scalar_desc; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 6fd074f8d8506d..7150d51ecf1e48 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -264,6 +264,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut } if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) { jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1)); + } else { + jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 0)); } } else { DimensionAccessHelperJit input0_dims(params.inputs[0]); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.cpp new file mode 100644 index 00000000000000..32719501d937d2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fully_connected_cm_example.h" + +namespace kernel_selector { +KernelsData FullyConnected_cm_example::GetKernelsData(const Params& params) const { + if (!Validate(params)) { + return {}; + } + auto options = std::string(" -Qxcm_jit_option=-DPASTokenReduction "); + 
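    // (Editorial note, hedged) The rest of this function shows the minimal
    // KernelData wiring for a CM kernel: a single INPUT argument, hard-coded
    // local/global work-group sizes, and a JIT define that binds the generic
    // entry point in cm_kernels/example.cpp to a unique KERNEL_NAME. The
    // "-Qxcm_jit_option" switch above is forwarded verbatim to the CM JIT
    // compiler; treat that particular option as illustrative for this tutorial
    // kernel rather than required.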
+ KernelData kd = KernelData::Default(params, 1); + auto& kernel = kd.kernels[0]; + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.workGroups.local = {1, 2, 4}; + kernel.params.workGroups.global = {1, 4, 8}; + + std::string kernel_name = "fully_connected_cm_example"; + auto jit = std::pair("\n#define KERNEL_NAME " + kernel_name, "#undef KERNEL_NAME"); + kernel.code.kernelString = GetKernelString("example", jit, kernel_name); + kernel.code.kernelString->options += options; + kernel.code.kernelString->batch_compilation = true; + return {kd}; +} +KernelsPriority FullyConnected_cm_example::GetKernelsPriority(const Params& params) const { + return TUTORIAL_PRIORITY; +} +ParamsKey FullyConnected_cm_example::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + k.EnableInputWeightsType(WeightsType::F16); + k.EnableInputWeightsType(WeightsType::UINT8); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableDifferentInputWeightsTypes(); + k.EnableDifferentTypes(); + k.EnableBiasPerOutput(); + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableQuantization(QuantizationType::SYMMETRIC); + k.EnableWeightsCompression(); + return k; +} +bool FullyConnected_cm_example::Validate(const Params& p) const { + return true; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.h new file mode 100644 index 00000000000000..844f3395bd8430 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_example.h @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "../fully_connected_params.h" +#include "kernel_base_cm.h" + +namespace kernel_selector { +class FullyConnected_cm_example : public KernelBaseCM { +public: + FullyConnected_cm_example() : KernelBaseCM("fully_connected_example") {} + virtual ~FullyConnected_cm_example() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.cpp new file mode 100644 index 00000000000000..dfc6d4342b1490 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.cpp @@ -0,0 +1,17 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fully_connected_cm_kernel_selector.h" + +#include "fully_connected_cm_example.h" + +namespace kernel_selector { +fully_connected_cm_kernel_selector::fully_connected_cm_kernel_selector() { + Attach(); +} + +KernelsData fully_connected_cm_kernel_selector::GetBestKernels(const Params& params) const { + return GetAutoTuneBestKernel(params, KernelType::FULLY_CONNECTED); +} +} // namespace kernel_selector diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.h new file mode 100644 index 00000000000000..937d605f9ebad2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/cm/fully_connected_cm_kernel_selector.h @@ -0,0 +1,24 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { + +class fully_connected_cm_kernel_selector : public kernel_selector_base { +public: + static fully_connected_cm_kernel_selector& Instance() { + static fully_connected_cm_kernel_selector instance_; + return instance_; + } + + fully_connected_cm_kernel_selector(); + + virtual ~fully_connected_cm_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp index ddfb491f50278a..ce20f49de597ff 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp @@ -167,7 +167,7 @@ void KVCacheUpdateKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { const auto indexes_dt = Datatype::INT32; const auto target_seq_len_block_size = 16; - const auto target_seq_len = prim_params.conf.paged_attention_aligned_seq_len; + const auto target_seq_len = std::max(prim_params.conf.paged_attention_aligned_seq_len, static_cast(1)); const auto indexes_buf_size = CeilDiv(target_seq_len, target_seq_len_block_size) * BytesPerElement(indexes_dt); kd.internalBufferSizes.clear(); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp index 63c5e74160f652..909a40d677f535 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "sdpa_kernel_opt.h" #include "pa_sdpa_kernel_opt.h" #include "kernel_selector_params.h" @@ -15,6 +16,7 @@ enum KernelsTypes { MULTI_TOKENS, FINALIZATION, FINALIZATION_MULTI_TOKENS, + SCORES_CALCULATION, TOTAL_KERNELS_NUM }; @@ -35,6 +37,8 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type) { kernel_name += "_finalization"; } else if (type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { kernel_name += "_finalization_multi_tokens_seq"; + } else if (type == KernelsTypes::SCORES_CALCULATION) { + kernel_name += "_scores_calculation"; } return kernel_name; @@ -46,10 +50,15 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { } const auto& params = static_cast(p); - const std::vector kernels_type = { KernelsTypes::SINGLE_TOKEN, - KernelsTypes::MULTI_TOKENS, - KernelsTypes::FINALIZATION, - KernelsTypes::FINALIZATION_MULTI_TOKENS }; + std::vector kernels_type = { KernelsTypes::SINGLE_TOKEN, + KernelsTypes::MULTI_TOKENS, + KernelsTypes::FINALIZATION, + KernelsTypes::FINALIZATION_MULTI_TOKENS }; + + const auto has_scores_output = params.outputs.size() > 1; + if (has_scores_output) { + 
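        // (Editorial note, hedged) A second output on the primitive is the
        // marker for the optional scores output: when it is present, a fifth
        // kernel (SCORES_CALCULATION) is appended after the two main and two
        // finalization kernels, so kd.kernels.size() becomes TOTAL_KERNELS_NUM
        // instead of TOTAL_KERNELS_NUM - 1; update_dispatch_data_func below
        // performs the same outputs.size() check to validate this.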
kernels_type.push_back(KernelsTypes::SCORES_CALCULATION); + } KernelData kd = KernelData::Default(params, kernels_type.size()); kd.needs_sub_kernels_sync = true; @@ -65,7 +74,8 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { const auto jit = CreateJit(kernel_name, jit_constants, entry_point); - size_t inputs_num = static_cast(params.inputs.size()); + int inputs_num = static_cast(params.inputs.size()); + int outputs_num = 1; if (kernel_type == KernelsTypes::SINGLE_TOKEN) { // SINGLE_TOKEN kernel doesn't use the subsequence_begins input inputs_num -= 1; @@ -75,6 +85,11 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { } else if (kernel_type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { // FINALIZATION_MULTI_TOKENS kernel uses past_lens data input and subsequence_begins inputs_num = 2; + } else if (kernel_type == KernelsTypes::SCORES_CALCULATION) { + // SCORES_CALCULATION kernel uses past_lens data input and subsequence_begins + inputs_num = 2; + // Output is configured manually to use the second output memory buffer + outputs_num = 0; } auto& kernel = kd.kernels[kd_kernels_idx++]; @@ -87,19 +102,33 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { {}, false, false, - static_cast(inputs_num), + inputs_num, GetFusedPrimitiveInputsCount(params), - static_cast(params.outputs.size()), + outputs_num, params.is_shape_agnostic); - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); + if (kernel_type == KernelsTypes::SCORES_CALCULATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 1}); + } + + uint32_t internal_buffers_num = 0; + if (has_scores_output) { + // Intermediate softmax results for scores output calculation and precalculated accumulated + // sequence length offsets for each subsequence + internal_buffers_num += 2; + } + + // Softmax's exp_sums, max_logits and intermediate output + internal_buffers_num += 3; if (kernel_type == KernelsTypes::MULTI_TOKENS || kernel_type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { // MULTIPLE_TOKENS kernels needs additional information related to mapping // launched kernel instances to subsequence indexes - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3}); + internal_buffers_num++; + } + + for (uint32_t i = 0; i < internal_buffers_num; i++) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, i}); } if (kernel_type == KernelsTypes::FINALIZATION || kernel_type == KernelsTypes::FINALIZATION_MULTI_TOKENS) { @@ -108,6 +137,15 @@ KernelsData PagedAttentionSDPAKernelOpt::GetKernelsData(const Params& p) const { // Remove unused shape_info argument at finalization stage kernel.params.arguments.erase(kernel.params.arguments.begin()); } + + if (kernel_type == KernelsTypes::SCORES_CALCULATION) { + // The scores kernel needs to know if the current execution mode is mixed or ordinary + // to configure proper memory access + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + // Remove unused shape_info argument for scores kernel + kernel.params.arguments.erase(kernel.params.arguments.begin()); + } } return {kd}; @@ -173,7 +211,12 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& 
jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", config.group_size)); } - auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS ? 1 : 0; + auto sdpa_stage = 0; + if (kernel_idx == KernelsTypes::FINALIZATION || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) { + sdpa_stage = 1; + } else if (kernel_idx == KernelsTypes::SCORES_CALCULATION) { + sdpa_stage = 2; + } jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1)); if (config.has_const_scale_val) { @@ -190,6 +233,10 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& jit.Merge(MakeTypeJitConstants(params.inputs[alibi_input_idx].GetDType(), "ALIBI_INPUT")); } + if (params.outputs.size() > 1) { + jit.AddConstant(MakeJitConstant("PAGED_ATTENTION_SCORES_OUTPUT", 1)); + } + if (kernel_idx == KernelsTypes::MULTI_TOKENS || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) jit.AddConstant(MakeJitConstant("MULTI_TOKENS_PROCESSING", 1)); @@ -203,18 +250,36 @@ CommonDispatchData PagedAttentionSDPAKernelOpt::SetDefault(const pa_sdpa_params& const auto& input = params.inputs[0]; if (!input.is_dynamic()) { - const size_t sequences_number = input.Batch().v; - const size_t num_of_partitions = CeilDiv(params.max_context_len, seq_len_partition_size); + const size_t total_tokens = input.Batch().v; + const size_t num_of_partitions = CeilDiv(params.conf.paged_attention_max_len, seq_len_partition_size); const size_t heads_num = static_cast(params.conf.heads_num); const size_t head_size = static_cast(params.conf.head_size); - if (kernel_idx == 0) { - dispatch_data.gws = { sequences_number, + if (kernel_idx == KernelsTypes::SINGLE_TOKEN || kernel_idx == KernelsTypes::MULTI_TOKENS) { + dispatch_data.gws = { total_tokens, heads_num, head_size * num_of_partitions }; dispatch_data.lws = { 1, 1, head_size }; + } else if (kernel_idx == KernelsTypes::SCORES_CALCULATION) { + const auto& past_lens = params.inputs[3]; + const auto subsequences_number = past_lens.Batch().v; + + size_t partition_size = 0; + size_t num_of_partitions = 0; + if (params.stage == PagedAttentionStage::PREFILL) { + partition_size = SDPAKernelOpt::get_seq_len_partition_size(params, params.conf.head_size, 1); + } else { + partition_size = seq_len_partition_size; + } + + num_of_partitions = CeilDiv(params.conf.paged_attention_max_len, partition_size); + + dispatch_data.gws = { partition_size * num_of_partitions, + 1, + subsequences_number }; + dispatch_data.lws = { partition_size, 1, 1 }; } else { - dispatch_data.gws = { sequences_number, + dispatch_data.gws = { total_tokens, heads_num, head_size }; dispatch_data.lws = { 1, 1, subgroup_size }; @@ -228,30 +293,39 @@ void PagedAttentionSDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) cons kd.update_dispatch_data_func = [](const Params& params, KernelData& kd) { const auto& prim_params = static_cast(params); - const size_t expected_kernels_num = 4; - OPENVINO_ASSERT(kd.kernels.size() == expected_kernels_num, "[GPU] Invalid kernels size for update dispatch data func of SDPA kernel"); + const auto has_scores_output = prim_params.outputs.size() > 1; + const auto expected_kernels_num = has_scores_output ? 
KernelsTypes::TOTAL_KERNELS_NUM : KernelsTypes::TOTAL_KERNELS_NUM - 1; + OPENVINO_ASSERT(kd.kernels.size() == static_cast(expected_kernels_num), + "[GPU] Invalid kernels size for update dispatch data func of SDPA kernel"); + + const auto scores_calc_only = prim_params.stage == PagedAttentionStage::PREFILL && has_scores_output; + const auto multi_tokens_mode = prim_params.stage == PagedAttentionStage::MIXED; auto dispatch_data1 = SetDefault(prim_params, KernelsTypes::SINGLE_TOKEN); kd.kernels[KernelsTypes::SINGLE_TOKEN].params.workGroups.global = dispatch_data1.gws; kd.kernels[KernelsTypes::SINGLE_TOKEN].params.workGroups.local = dispatch_data1.lws; - kd.kernels[KernelsTypes::SINGLE_TOKEN].skip_execution = prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::SINGLE_TOKEN].skip_execution = multi_tokens_mode || scores_calc_only; kd.kernels[KernelsTypes::MULTI_TOKENS].params.workGroups.global = dispatch_data1.gws; kd.kernels[KernelsTypes::MULTI_TOKENS].params.workGroups.local = dispatch_data1.lws; - kd.kernels[KernelsTypes::MULTI_TOKENS].skip_execution = !prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::MULTI_TOKENS].skip_execution = !multi_tokens_mode || scores_calc_only; - const auto& input = prim_params.inputs[0]; - const size_t sequences_number = input.Batch().v; - const size_t num_of_partitions = CeilDiv(prim_params.max_context_len, seq_len_partition_size); + size_t partition_size = 0; + if (prim_params.stage == PagedAttentionStage::PREFILL) { + partition_size = SDPAKernelOpt::get_seq_len_partition_size(params, prim_params.conf.head_size, 1); + } else { + partition_size = seq_len_partition_size; + } + const size_t num_of_partitions = CeilDiv(prim_params.conf.paged_attention_max_len, partition_size); auto dispatch_data2 = SetDefault(prim_params, KernelsTypes::FINALIZATION); kd.kernels[KernelsTypes::FINALIZATION].params.workGroups.global = dispatch_data2.gws; kd.kernels[KernelsTypes::FINALIZATION].params.workGroups.local = dispatch_data2.lws; - kd.kernels[KernelsTypes::FINALIZATION].skip_execution = num_of_partitions == 1 || prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::FINALIZATION].skip_execution = num_of_partitions == 1 || multi_tokens_mode || scores_calc_only; kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.workGroups.global = dispatch_data2.gws; kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.workGroups.local = dispatch_data2.lws; - kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].skip_execution = num_of_partitions == 1 || !prim_params.multi_tokens_mode; + kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].skip_execution = num_of_partitions == 1 || !multi_tokens_mode || scores_calc_only; ScalarDescriptor num_of_partitions_scalar; num_of_partitions_scalar.t = ScalarDescriptor::Types::UINT32; @@ -261,23 +335,63 @@ void PagedAttentionSDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) cons kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.scalars.resize(1); kd.kernels[KernelsTypes::FINALIZATION_MULTI_TOKENS].params.scalars[0] = num_of_partitions_scalar; + if (has_scores_output) { + auto dispatch_data = SetDefault(prim_params, KernelsTypes::SCORES_CALCULATION); + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.workGroups.global = dispatch_data.gws; + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.workGroups.local = dispatch_data.lws; + kd.kernels[KernelsTypes::SCORES_CALCULATION].skip_execution = false; + + ScalarDescriptor is_mixed_mode; + is_mixed_mode.t = ScalarDescriptor::Types::UINT32; + is_mixed_mode.v.u32 = 
static_cast(multi_tokens_mode); + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.scalars.resize(1); + kd.kernels[KernelsTypes::SCORES_CALCULATION].params.scalars[0] = is_mixed_mode; + } + + const auto& input = prim_params.inputs[0]; + const size_t total_tokens = input.Batch().v; + auto buf_dt_size = BytesPerElement(softmax_acc_dt); - auto buf_elements_count = sequences_number * prim_params.conf.heads_num * num_of_partitions; + auto buf_elements_count = total_tokens * prim_params.conf.heads_num * num_of_partitions; auto buf_size = buf_elements_count * buf_dt_size; auto tmp_out_dt_size = BytesPerElement(softmax_acc_dt); - auto tmp_out_elements_count = sequences_number * prim_params.conf.heads_num * prim_params.conf.head_size * num_of_partitions; + auto tmp_out_elements_count = total_tokens * prim_params.conf.heads_num * prim_params.conf.head_size * num_of_partitions; auto tmp_out_size = tmp_out_elements_count * tmp_out_dt_size; kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(buf_size); - kd.internalBufferSizes.push_back(buf_size); - kd.internalBufferSizes.push_back(tmp_out_size); + + if (has_scores_output) { + const auto& past_lens = prim_params.inputs[3]; + auto subsequences_number = past_lens.Batch().v; + auto softmax_buf_dt_size = BytesPerElement(softmax_acc_dt); + + auto softmax_buf_elements_count = subsequences_number * prim_params.conf.heads_num * num_of_partitions * partition_size; + auto softmax_buf_size = softmax_buf_elements_count * softmax_buf_dt_size; + + // Softmax intermediate output + kd.internalBufferSizes.push_back(softmax_buf_size); + // Precalculated accumulated sequence length offsets for each subsequence + kd.internalBufferSizes.push_back(subsequences_number * BytesPerElement(Datatype::INT32)); + + if (prim_params.stage == PagedAttentionStage::PREFILL) { + // Recalculate buf_size as in case of PREFILL stage it's not needed to allocate buffer per each input token + buf_elements_count = subsequences_number * prim_params.conf.heads_num * num_of_partitions; + buf_size = buf_elements_count * buf_dt_size; + + // Intermediate tmp output buffer is not used for PREFILL stage + tmp_out_size = tmp_out_dt_size; + } + } + + kd.internalBufferSizes.push_back(buf_size); // softmax exp_sums + kd.internalBufferSizes.push_back(buf_size); // softmax max_logits + kd.internalBufferSizes.push_back(tmp_out_size); // intermediate output kd.internalBufferDataType = softmax_acc_dt; - if (prim_params.multi_tokens_mode) { + if (multi_tokens_mode) { auto buf_dt_size = BytesPerElement(Datatype::INT32); - auto buf_elements_count = sequences_number; + auto buf_elements_count = total_tokens; auto buf_size = Align(buf_elements_count * buf_dt_size, BytesPerElement(softmax_acc_dt)); kd.internalBufferSizes.push_back(buf_size); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h index a2456ccd9e2af5..a52571b03691df 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.h @@ -9,11 +9,17 @@ namespace kernel_selector { +enum PagedAttentionStage { + GENERATE = 0, + PREFILL = 1, + MIXED = 2, + UNKNOWN = 3 +}; + struct pa_sdpa_params : base_params { pa_sdpa_params() : base_params(KernelType::PA_SDPA) {} - bool multi_tokens_mode = false; - size_t max_context_len = 0; + PagedAttentionStage stage = PagedAttentionStage::UNKNOWN; sdpa_configuration conf; }; diff --git 
index 5cd9c384ff2709..8fcc4a16692d6c 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h
@@ -97,6 +97,7 @@ struct sdpa_configuration {
     bool is_paged_attention = false;
     int64_t paged_attention_aligned_seq_len = -1;
     int64_t paged_attention_block_size = 0;
+    int64_t paged_attention_max_len = 0;
 
     bool has_const_scale_val = false;
     float scale_val = 0.f;
 };
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp
index 4e71064efbc895..4c23d4de4fd68d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp
@@ -21,38 +21,11 @@ enum KernelsTypes {
 constexpr size_t subgroup_size = 16;
 }  // namespace
 
-static size_t get_sg_number_scale_factor(const sdpa_params& sdpa_params, size_t kernel_type) {
-    const size_t optimal_scale_factor = 2;
-    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
-        if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) {
-            return optimal_scale_factor;
-        }
-    } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) {
-        if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize &&
-            sdpa_params.conf.head_size * optimal_scale_factor / subgroup_size <= subgroup_size) {
-            return optimal_scale_factor;
-        }
-    }
-
-    return 1;
-}
-
 static size_t get_target_seq_len_block_size() {
     const size_t block_size = 16;
     return block_size;
 }
 
-static size_t get_seq_len_partition_size(const sdpa_params& sdpa_params, size_t kernel_type) {
-    size_t seq_len = 0;
-    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
-        seq_len = sdpa_params.conf.head_size * get_sg_number_scale_factor(sdpa_params, kernel_type);
-    } else {
-        seq_len = 256;
-    }
-
-    return seq_len;
-}
-
 static Datatype get_softmax_acc_type() {
     return Datatype::F32;
 }
@@ -71,7 +44,7 @@ static size_t get_partitions_num(const sdpa_params& sdpa_params, size_t kernel_t
     TransposedDimensionAccessHelperBase dims_k(sdpa_params.inputs[1], sdpa_params.input1_order);
     auto source_seq_len = dims_k.y_dim().v;
 
-    return CeilDiv(source_seq_len, get_seq_len_partition_size(sdpa_params, kernel_type));
+    return CeilDiv(source_seq_len, SDPAKernelOpt::get_seq_len_partition_size(sdpa_params, sdpa_params.conf.head_size, kernel_type));
 }
 
 static std::vector<size_t> get_internal_buffer_sizes(const sdpa_params& sdpa_params, size_t kernel_type) {
@@ -130,6 +103,33 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const
     return kernel_name;
 }
 
+size_t SDPAKernelOpt::get_sg_number_scale_factor(const Params& params, size_t head_size, size_t kernel_type) {
+    const size_t optimal_scale_factor = 2;
+    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
+        if (head_size * optimal_scale_factor <= params.engineInfo.maxWorkGroupSize) {
+            return optimal_scale_factor;
+        }
+    } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) {
+        if (head_size * optimal_scale_factor <= params.engineInfo.maxWorkGroupSize &&
+            head_size * optimal_scale_factor / subgroup_size <= subgroup_size) {
+            return optimal_scale_factor;
+        }
+    }
+
+    return 1;
+}
+
+size_t SDPAKernelOpt::get_seq_len_partition_size(const Params& params, size_t head_size, size_t kernel_type) {
+    size_t seq_len = 0;
+    if (kernel_type == KernelsTypes::MULTI_TOKENS) {
+        seq_len = head_size * get_sg_number_scale_factor(params, head_size, kernel_type);
+    } else {
+        seq_len = 256;
+    }
+
+    return seq_len;
+}
+
 ParamsKey SDPAKernelOpt::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
@@ -176,14 +176,14 @@ JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t ke
     const auto& config = params.conf;
 
     jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size));
     jit.AddConstant(MakeJitConstant("HEAD_SIZE", config.head_size));
-    jit.AddConstant(MakeJitConstant("SEQ_LEN_PARTITION_SIZE", get_seq_len_partition_size(params, kernel_idx)));
+    jit.AddConstant(MakeJitConstant("SEQ_LEN_PARTITION_SIZE", get_seq_len_partition_size(params, config.head_size, kernel_idx)));
 
     auto target_seq_len_block_size = kernel_idx == KernelsTypes::SINGLE_TOKEN ? 1 : get_target_seq_len_block_size();
     jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN_BLOCK_SIZE", target_seq_len_block_size));
 
     auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION ? 1 : 0;
     jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1));
-    jit.AddConstant(MakeJitConstant("SG_SCALE_FACTOR", get_sg_number_scale_factor(params, kernel_idx)));
+    jit.AddConstant(MakeJitConstant("SG_SCALE_FACTOR", get_sg_number_scale_factor(params, config.head_size, kernel_idx)));
 
     if (params.conf.is_paged_attention) {
         if (params.conf.has_alibi_input) {
@@ -196,6 +196,10 @@ JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t ke
         } else {
             jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", 1));
         }
+
+        if (params.outputs.size() > 1) {
+            jit.AddConstant(MakeJitConstant("PAGED_ATTENTION_SCORES_OUTPUT", 1));
+        }
     } else if (params.inputs.size() <= 4) {
         jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", std::sqrt(static_cast<float>(params.conf.head_size))));
         jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE", 1.0f / std::sqrt(static_cast<float>(params.conf.head_size))));
@@ -218,11 +222,11 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k
     if (params.conf.is_paged_attention) {
         OPENVINO_ASSERT(kernel_idx == KernelsTypes::MULTI_TOKENS);
 
-        const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx);
         const size_t heads_num = static_cast<size_t>(params.conf.heads_num);
+        const size_t head_size = static_cast<size_t>(params.conf.head_size);
+        const size_t sg_num_scale = get_sg_number_scale_factor(params, head_size, kernel_idx);
         const size_t target_seq_len_block_size = get_target_seq_len_block_size();
         const size_t target_seq_len = static_cast<size_t>(params.conf.paged_attention_aligned_seq_len);
-        const size_t head_size = static_cast<size_t>(params.conf.head_size);
 
         dispatch_data.gws = { heads_num,
                               CeilDiv(target_seq_len, target_seq_len_block_size),
@@ -243,13 +247,13 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k
         const size_t target_seq_len_block_size = kernel_idx == 1 ? get_target_seq_len_block_size() : 1;
 
         if (kernel_idx == KernelsTypes::SINGLE_TOKEN) {
-            const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx);
+            const size_t sg_num_scale = get_sg_number_scale_factor(params, head_size, kernel_idx);
             dispatch_data.gws = { batch_size * heads_num,
                                   CeilDiv(target_seq_len, target_seq_len_block_size),
                                   head_size * num_of_partitions * sg_num_scale };
             dispatch_data.lws = { 1, 1, head_size * sg_num_scale };
         } else if (kernel_idx == KernelsTypes::MULTI_TOKENS) {
-            const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx);
+            const size_t sg_num_scale = get_sg_number_scale_factor(params, head_size, kernel_idx);
             dispatch_data.gws = { batch_size * heads_num,
                                   CeilDiv(target_seq_len, target_seq_len_block_size),
                                   head_size * sg_num_scale };
@@ -317,7 +321,7 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const {
                              false,
                              inputs_num,
                              GetFusedPrimitiveInputsCount(params),
-                             static_cast<uint32_t>(prim_params.outputs.size()),
+                             1 /* number_of_outputs */,
                              prim_params.is_shape_agnostic);
 
         auto beam_table_idx = prim_params.inputs.size();
@@ -339,6 +343,19 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const {
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2});
 
+        if (prim_params.conf.is_paged_attention && prim_params.outputs.size() > 1) {
+            // Intermediate buffers for PagedAttention scores calculation:
+            // softmax_results, subsequence_offsets, exp_sums, max_logits, tmp_out
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 5});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6});
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 7});
+
+            // Scalar used for proper offset calculation of intermediate data buffers
+            kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
+        }
+
         const auto buf_sizes = get_internal_buffer_sizes(prim_params, kernel_idx);
         if (!prim_params.conf.is_paged_attention) {
             kd.internalBufferSizes.clear();
@@ -379,6 +396,15 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const {
             kernel_data.kernels[0].params.workGroups.global = dispatch_data.gws;
             kernel_data.kernels[0].params.workGroups.local = dispatch_data.lws;
             kernel_data.kernels[0].skip_execution = false;
+
+            if (prim_params.outputs.size() > 1) {
+                const auto max_seq_len = prim_params.conf.paged_attention_max_len;
+                const auto seq_len_partition_size = get_seq_len_partition_size(params, prim_params.conf.head_size, KernelsTypes::MULTI_TOKENS);
+
+                kernel_data.kernels[0].params.scalars.resize(1);
+                kernel_data.kernels[0].params.scalars[0].t = ScalarDescriptor::Types::UINT32;
+                kernel_data.kernels[0].params.scalars[0].v.u32 = static_cast<uint32_t>(Align(max_seq_len, seq_len_partition_size));
+            }
         } else {
             const auto num_of_partitions = get_partitions_num(prim_params, KernelsTypes::SINGLE_TOKEN);
             const auto buf_sizes = get_internal_buffer_sizes(prim_params, KernelsTypes::SINGLE_TOKEN);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h
index 8d7279f5546112..a4d351498d7075 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h
@@ -17,6 +17,9 @@ class SDPAKernelOpt : public SDPAKernelBase {
     KernelsPriority GetKernelsPriority(const Params& params) const override;
     ParamsKey GetSupportedKey() const override;
 
+    static size_t get_sg_number_scale_factor(const Params& params, size_t head_size, size_t kernel_type);
+    static size_t get_seq_len_partition_size(const Params& params, size_t head_size, size_t kernel_type);
+
 protected:
     bool Validate(const Params& p) const override;
     void GetUpdateDispatchDataFunc(KernelData& kd) const override;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp
index cd8128baff37c9..e9fa5dd675629a 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp
@@ -21,16 +21,28 @@ namespace cache {
 primitive_db::primitive_db()
     : primitives({
 #include "ks_primitive_db.inc"
+      }),
+      cm_primitives({
+#include "ks_cm_primitive_db.inc"
       }),
       batch_headers({
 #include "ks_primitive_db_batch_headers.inc"
+      }),
+      cm_batch_headers({
+#include "ks_cm_primitive_db_batch_headers.inc"
       }) {
 }
 
-std::vector<code> primitive_db::get(const primitive_id& id) const {
+std::vector<code> primitive_db::get(const primitive_id& id, bool is_cm) const {
 #ifndef NDEBUG
     {
-        std::ifstream kernel_file{id + ".cl", std::ios::in | std::ios::binary};
+        std::string filename = id;
+        if (!is_cm) {
+            filename += ".cl";
+        } else {
+            filename += ".cpp";
+        }
+        std::ifstream kernel_file{filename, std::ios::in | std::ios::binary};
         if (kernel_file.is_open()) {
             code ret;
             auto beg = kernel_file.tellg();
@@ -46,7 +58,11 @@ std::vector<code> primitive_db::get(const primitive_id& id) const {
     }
 #endif
     try {
-        const auto codes = primitives.equal_range(id);
+        auto* primitives_ptr = &primitives;
+        if (is_cm) {
+            primitives_ptr = &cm_primitives;
+        }
+        const auto codes = primitives_ptr->equal_range(id);
 
         std::vector<code> temp;
         std::for_each(codes.first, codes.second, [&](const std::pair<const primitive_id, code>& c) {
             temp.push_back(c.second);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h
index e384f6c9879fb5..5c6987246ce1f4 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h
@@ -21,8 +21,9 @@ using primitive_id = std::string;
 
 struct primitive_db {
     primitive_db();
-    std::vector<code> get(const primitive_id& id) const;
+    std::vector<code> get(const primitive_id& id, bool is_cm = false) const;
     std::map<primitive_id, std::string> get_batch_headers() const { return std::move(batch_headers); }
+    std::map<primitive_id, std::string> get_cm_batch_headers() const { return std::move(cm_batch_headers); }
 
 private:
     struct case_insensitive_compare {
@@ -35,7 +36,9 @@
         }
     };
     std::multimap<primitive_id, code, case_insensitive_compare> primitives;
+    std::multimap<primitive_id, code, case_insensitive_compare> cm_primitives;
     std::map<primitive_id, std::string> batch_headers;
+    std::map<primitive_id, std::string> cm_batch_headers;
 };
 
 }  // namespace cache
diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
index 116844f3bccfc7..393e67f3bdb6aa 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
@@ -6,21 +6,30 @@
 # the trailing characters are a tag to allow multiple primitive implementations
 
 from __future__ import print_function
+from enum import Enum
 import os
 import argparse
 import glob
 import ntpath
 import re
 
-class OpenCL2CHeaders(object):
+class KernelLang(Enum):
+    OCLC = 0
+    CM = 1
+
+    def header_extension(self):
+        return (".cl", ".h")[self.value]
+
+    def source_extension(self):
+        return (".cl", ".cpp")[self.value]
+
+class Kernels2CHeaders(object):
 
-    def __init__(self, kernels_folder, out_path, out_file_name_prim_db, out_file_name_batch_headers):
+    def __init__(self, kernels_folder, out_path, out_file_name_prim_db, out_file_name_batch_headers, kernel_lang):
         self.kernels_folder = os.path.abspath(kernels_folder)
         self.out_path = os.path.abspath(out_path)
         self.out_file_name_prim_db = out_file_name_prim_db
         self.out_file_name_batch_headers = out_file_name_batch_headers
         self.include_files = {}
         self.batch_headers = []
+        self.kernel_lang = kernel_lang
        self.find_and_set_batch_headers()
 
     # NOTE: batch_headers are headers with macros on which the runtime jitter might depend on.
@@ -29,7 +38,7 @@ def __init__(self, kernels_folder, out_path, out_file_name_prim_db, out_file_nam
     # specially for improving the jit compilation performance, i.e.,
     # they are not to be included in each kernel, but to be included only once at the beginning of each batch.
     def find_and_set_batch_headers(self):
-        batch_headers_list = [ntpath.basename(h) for h in glob.glob(os.path.join(self.kernels_folder, "include/batch_headers/*.cl"))]
+        batch_headers_list = [ntpath.basename(h) for h in glob.glob(os.path.join(self.kernels_folder, "include/batch_headers/*" + self.kernel_lang.header_extension()))]
         deps = {}
         for h in batch_headers_list:
             header_file = os.path.abspath(os.path.join(self.kernels_folder, "include/batch_headers", h))
@@ -56,11 +65,11 @@ def topological_sort(self, cur_key, items, stack, res):
 
     def convert(self):
         res = '// This file is autogenerated by primitive_db_gen.py, all changes to this file will be undone\n\n'
-        filelist = glob.glob(os.path.join(self.kernels_folder, "*.cl"))
+        filelist = glob.glob(os.path.join(self.kernels_folder, "*" + self.kernel_lang.source_extension()))
         for filename in filelist:
             #try:
             print('processing {}'.format(filename))
-            res += self.cl_file_to_str(filename)
+            res += self.kernel_file_to_str(filename)
             #except:
             #    pass
         out_file_name_prim_db = os.path.join(self.out_path, self.out_file_name_prim_db)
@@ -198,8 +207,8 @@ def batch_headers_to_str(self):
         characters = 1  # Newline character above
         res = ""
         for h in self.batch_headers:
-            header_name = h[:h.find('.cl')]
-            res += '{{"{}",\n(std::string) R"(\n'.format(header_name)
+            header_name = h[:h.rfind('.')]
+            res += '{{"{}",\n(std::string) R"-(\n'.format(header_name)
             header_file = os.path.abspath(os.path.join(os.path.dirname(self.kernels_folder + "/include/batch_headers"), "batch_headers/" + h))
             content = []
             with open(header_file) as f:
@@ -208,11 +217,11 @@
                 if line.startswith('#include'):
                     continue
                 if (i + 1) % max_lines == 0 or characters + len(line) + 1 > max_characters:
-                    res += ')"\n + (std::string) R"('
+                    res += ')-"\n + (std::string) R"-('
                     characters = 0
                 res += '{}\n'.format(line.rstrip())
                 characters += len(line) + 1
-            res += ')"},\n\n'
+            res += ')-"},\n\n'
 
         return self.post_process_sources(res)
 
     def post_process_sources(self, content):
@@ -241,10 +250,10 @@ def comment_replacer(match):
 
         return content
 
-    def cl_file_to_str(self, filename):
+    def kernel_file_to_str(self, filename):
         name = ntpath.basename(filename)
         self.include_files[filename] = {}
-        kernel_name = name[:name.find('.cl')]
+        kernel_name = name[:name.rfind('.')]
         res = '{{"{}",\n(std::string) R"__krnl(\n'.format(kernel_name)
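+        # NOTE (editorial comment, not in the original patch): keying entries by file
+        # stem via rfind('.') lets '.cl' (OpenCL C) and '.cpp' (CM) kernel sources
+        # share the same db naming scheme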
         content = self.append_file_content(filename, filename)
         content += self.append_undefs(filename)
@@ -265,16 +274,17 @@ def cl_file_to_str(self, filename):
 
         return res
 
-
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-kernels', required=True, metavar='PATH', help='The absolute path to OpenCL kernels folder')
     ap.add_argument('-out_path', required=True, metavar='PATH', help='The absolute path to dump file')
     ap.add_argument('-out_file_name_prim_db', required=True, metavar='PATH', help='dump file name')
     ap.add_argument('-out_file_name_batch_headers', required=True, metavar='PATH', help='dump file name')
+    ap.add_argument('-cm', required=False, action='store_true', help='Process CM kernel sources instead of ocl c')
     args = ap.parse_args()
 
-    converter = OpenCL2CHeaders(args.kernels, args.out_path, args.out_file_name_prim_db, args.out_file_name_batch_headers)
+    kernel_lang = KernelLang.CM if args.cm else KernelLang.OCLC
+    converter = Kernels2CHeaders(args.kernels, args.out_path, args.out_file_name_prim_db, args.out_file_name_batch_headers, kernel_lang)
     converter.convert()
 
 if __name__ == '__main__':
diff --git a/src/plugins/intel_gpu/src/plugin/ops/fake_convert.cpp b/src/plugins/intel_gpu/src/plugin/ops/fake_convert.cpp
new file mode 100644
index 00000000000000..282a483deab189
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/ops/fake_convert.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/plugin/program_builder.hpp"
+#include "intel_gpu/plugin/common_utils.hpp"
+
+#include "openvino/op/fake_convert.hpp"
+
+#include "intel_gpu/primitives/fake_convert.hpp"
+
+namespace ov {
+namespace intel_gpu {
+
+static void CreateFakeConvertOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v13::FakeConvert>& op) {
+    validate_inputs_count(op, {2, 3});
+    const auto inputs = p.GetInputInfo(op);
+    const std::string layerName = layer_type_name_ID(op);
+    ov::element::Type destination_type = op->get_destination_element_type();
+    std::shared_ptr<cldnn::fake_convert> fake_convert_prim = nullptr;
+    if (inputs.size() == 2) {
+        fake_convert_prim = std::make_shared<cldnn::fake_convert>(layerName,
+                                                                  inputs[0],
+                                                                  inputs[1],
+                                                                  destination_type);
+    } else {
+        fake_convert_prim = std::make_shared<cldnn::fake_convert>(layerName,
+                                                                  inputs[0],
+                                                                  inputs[1],
+                                                                  inputs[2],
+                                                                  destination_type);
+    }
+
+    p.add_primitive(*op, fake_convert_prim);
+}
+
+REGISTER_FACTORY_IMPL(v13, FakeConvert);
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
index 7425b096b6d324..d82d3a66fed7f7 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
@@ -61,10 +61,13 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared_ptr<ov::op::PagedAttentionExtension>& op) {
     OPENVINO_ASSERT(alibi_const != nullptr);
     prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0;
 
+    prim.num_outputs = 1;
     if (op->get_output_size() > 1) {
         const auto scores_output_idx = 1;
         const auto& users = op->get_output_target_inputs(scores_output_idx);
-        OPENVINO_ASSERT(users.size() == 0, "[GPU] PagedAttention implementation doesn't support scores output yet");
+        if (users.size() > 0) {
+            prim.num_outputs++;  // Add scores output
+        }
     }
 
     p.add_primitive(*op, prim);
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 53ab9aa188b7aa..7c7c09adcd182f 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -92,6 +92,7 @@
 #include "transformations/common_optimizations/lstm_cell_fusion.hpp"
 #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp"
 #include "transformations/common_optimizations/mvn_fusion.hpp"
+#include "transformations/common_optimizations/sdpa_scale_fusion.hpp"
 #include "transformations/common_optimizations/softmax_fusion.hpp"
 #include "transformations/common_optimizations/glu_fusion.hpp"
 #include "transformations/common_optimizations/transpose_sinking.hpp"
@@ -941,6 +942,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         if (!disable_horizontal_fc_fusion)
             manager.register_pass();
 
+        manager.register_pass<ov::pass::SDPAScaleFusion>();
         manager.register_pass();
         auto pass_config = manager.get_pass_config();
         manager.register_pass();
diff --git a/src/plugins/intel_gpu/src/runtime/layout.cpp b/src/plugins/intel_gpu/src/runtime/layout.cpp
index a2b7e62ea0cae2..5c6c6dc83aeaea 100644
--- a/src/plugins/intel_gpu/src/runtime/layout.cpp
+++ b/src/plugins/intel_gpu/src/runtime/layout.cpp
@@ -446,8 +446,6 @@ bool layout::compatible(const layout& other) const {
     if (l1.is_dynamic() || l2.is_dynamic())
         return false;
 
-    auto l1_size = l1.get_tensor();
-    auto l2_size = l2.get_tensor();
     if (l1 == l2)
         return true;
     if (check_redundant_1d_along_feature(l1, l2))
@@ -459,7 +457,7 @@ bool layout::compatible(const layout& other) const {
     if (format::is_default_format(l1.format) && format::is_default_format(l2.format) &&
         !l1.data_padding && !l2.data_padding && l1.get_linear_size() == l2.get_linear_size())
         return true;
-    if (l1_size != l2_size)
+    if (l1.get_shape() != l2.get_shape())
         return false;
     if (l1.get_linear_size() != l2.get_linear_size())
         return false;
@@ -505,6 +503,19 @@ bool layout::compatible(const layout& other) const {
     auto l1_pitch = l1.get_pitches();
     auto l2_pitch = l2.get_pitches();
 
+    auto l1_padded_dims = l1.get_padded_dims();
+    auto l2_padded_dims = l2.get_padded_dims();
+
+    // Ignore pitches which will never be used (for padded dims with size == 1)
+    for (size_t i = 0; i < l1_padded_dims.size(); ++i) {
+        if (l1_padded_dims[i] == 1) {
+            l1_pitch[i] = 0;
+        }
+        if (l2_padded_dims[i] == 1) {
+            l2_pitch[i] = 0;
+        }
+    }
+
     auto l1_offset = l1.get_linear_offset();
     auto l2_offset = l2.get_linear_offset();
     if (l1_pitch == l2_pitch && l1_offset == l2_offset)
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/fake_convert.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/fake_convert.cpp
new file mode 100644
index 00000000000000..d1236f5c524421
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/fake_convert.cpp
@@ -0,0 +1,141 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "common_test_utils/file_utils.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+#include "openvino/op/parameter.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/gather.hpp"
+#include "openvino/op/fake_convert.hpp"
+
+namespace {
+
+namespace fp8 {
+constexpr float MAX_F8E4M3 = 448.f;
+constexpr float MAX_F8E5M2 = 57344.f;
+}  // namespace fp8
+
+using namespace std;
+using namespace ov;
+using namespace testing;
+using ov::test::InputShape;
+
+using FakeConvertTestParams = std::tuple<
+        ov::Shape,            // Input shapes
+        ov::Shape,            // Scale shape
+        ov::Shape,            // Shift shape
+        ov::element::Type,    // input precision
+        ov::element::Type,    // destination type
+        std::string >;        // device name
+
+class FakeConvertTest : public testing::WithParamInterface<FakeConvertTestParams>,
+                        virtual public ov::test::SubgraphBaseStaticTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<FakeConvertTestParams> obj) {
+        ov::Shape input_shape;
+        ov::Shape scale_shape;
+        ov::Shape shift_shape;
+        ov::element::Type prec;
+        ov::element::Type destination_type;
+        std::string target_device;
+
+        std::tie(input_shape, scale_shape, shift_shape, prec, destination_type, target_device) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=(";
+        result << ov::test::utils::vec2str(input_shape) << "_";
+        result << "scale_shape=" << ov::test::utils::vec2str(scale_shape) << "_";
+        result << "shift_shape=" << ov::test::utils::vec2str(shift_shape) << "_";
+        result << "input_precision=" << prec << "_";
+        result << "destination_type=" << destination_type << "_";
+        result << "device_type=" << target_device;
+        return result.str();
+    }
+
+protected:
+    ov::Shape input_shape, scale_shape, shift_shape;
+    ov::element::Type destination_type;
+
+    void SetUp() override {
+        ov::element::Type prec;
+        std::tie(input_shape, scale_shape, shift_shape, prec, destination_type, targetDevice) = GetParam();
+        const float MAX_FP8 = (destination_type == ov::element::f8e4m3) ? fp8::MAX_F8E4M3 : fp8::MAX_F8E5M2;
+        if (shift_shape.empty()) {
+            auto data = make_shared<op::v0::Parameter>(prec, input_shape);
+            auto scale = op::v0::Constant::create(prec,
+                                                  scale_shape,
+                                                  {MAX_FP8 / (MAX_FP8 / 2.f),
+                                                   1.0f,
+                                                   MAX_FP8 / (MAX_FP8 * 3.5f),
+                                                   MAX_FP8 / (MAX_FP8 * 4.f)});
+
+            auto op = make_shared<op::v13::FakeConvert>(data, scale, destination_type);
+
+            function = make_shared<Model>(OutputVector{op}, ParameterVector{data});
+        } else {
+            auto data = make_shared<op::v0::Parameter>(prec, input_shape);
+            auto scale = op::v0::Constant::create(prec,
+                                                  scale_shape,
+                                                  {MAX_FP8 / (MAX_FP8 / 2.f),
+                                                   1.0f,
+                                                   MAX_FP8 / (MAX_FP8 * 3.5f),
+                                                   MAX_FP8 / (MAX_FP8 * 4.f)});
+            auto shift = op::v0::Constant::create(prec, shift_shape, {0.f, 0.f, 0.f, 0.f});
+
+            auto op = make_shared<op::v13::FakeConvert>(data, scale, shift, destination_type);
+
+            function = make_shared<Model>(OutputVector{op}, ParameterVector{data});
+        }
+    }
+
+    void generate_inputs(const std::vector<ov::Shape>& target_shapes) override {
+        inputs.clear();
+        const float MAX_FP8 = (destination_type == ov::element::f8e4m3) ? fp8::MAX_F8E4M3 : fp8::MAX_F8E5M2;
+        const auto& func_inputs = function->inputs();
+        auto& data_input = func_inputs[0];
+        ov::Tensor tensor = ov::Tensor(data_input.get_element_type(), target_shapes[0]);
+        std::vector<float> input_data{MAX_FP8 / 4.f,
+                                      MAX_FP8 / 3.f,
+                                      MAX_FP8 / 2.f,
+                                      MAX_FP8,
+                                      MAX_FP8,
+                                      MAX_FP8,
+                                      MAX_FP8 * 1.2f,
+                                      MAX_FP8 * 2.3f,
+                                      MAX_FP8 * 3.4f,
+                                      MAX_FP8 * 2.f,
+                                      MAX_FP8 * 3.f,
+                                      MAX_FP8 * 4.f};
+        auto* data_ptr = tensor.data<float>();
+        for (size_t i = 0; i < input_data.size(); i++) {
+            data_ptr[i] = input_data[i];
+        }
+        inputs.insert({data_input.get_node_shared_ptr(), tensor});
+    }
+};
+
+TEST_P(FakeConvertTest, Inference) {
+    run();
+}
+
+const std::vector<ov::element::Type> input_precisions = {ov::element::f32};
+
+const std::vector<ov::Shape> input_shapes = {{4, 3}};
+
+const ov::Shape scale_shape = {4, 1};
+const std::vector<ov::Shape> shift_shapes = {{4, 1}, {}};
+const std::vector<ov::element::Type> destination_types = {ov::element::f8e4m3, ov::element::f8e5m2};
+
+INSTANTIATE_TEST_SUITE_P(Smoke_FakeConvertTest,
+                         FakeConvertTest,
+                         ::testing::Combine(::testing::ValuesIn(input_shapes),
+                                            ::testing::Values(scale_shape),
+                                            ::testing::ValuesIn(shift_shapes),
+                                            ::testing::ValuesIn(input_precisions),
+                                            ::testing::ValuesIn(destination_types),
+                                            ::testing::Values(ov::test::utils::DEVICE_GPU)),
+                         FakeConvertTest::getTestCaseName);
+}  // namespace
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp
index a16cd20846a1c7..5dfc450e43905a 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/impls_registry_test.cpp
@@ -85,6 +85,7 @@
 #include "intel_gpu/primitives/swiglu.hpp"
 #include "intel_gpu/primitives/tile.hpp"
 #include "intel_gpu/primitives/unique.hpp"
+#include "intel_gpu/primitives/fake_convert.hpp"
 #include "primitive_inst.h"
 #include "test_utils.h"
 
@@ -226,5 +227,6 @@ TEST(registry_test, no_null_impls) {
                cldnn::unique_count,
                cldnn::unique_gather,
                cldnn::scaled_dot_product_attention,
-               cldnn::rope>();
+               cldnn::rope,
+               cldnn::fake_convert>();
 }
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
index 7c666819176a13..279a86c73f55bf 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
@@ -261,6 +261,10 @@ INSTANTIATE_TEST_SUITE_P(smoke, layout_cmp_test,
          layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::is_os_zyx_isv16_osv16}, false, false},
         {layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::goiyx},
          layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::gioyx}, false, false},
+        {layout{ov::PartialShape{4, 1, 16, 16}, data_types::f16, format::bfyx},
+         layout{ov::PartialShape{4, 1, 16, 16}, data_types::f16, format::byxf}, false, true},
+        {layout{ov::PartialShape{2, 1, 2, 4}, data_types::f16, format::bfyx, padding({0, 0, 1, 0}, {0, 0, 1, 0})},
+         layout{ov::PartialShape{2, 1, 2, 4}, data_types::f16, format::bfyx, padding({0, 1, 0, 0}, {0, 0, 0, 0})}, false, false},
     }));
 
 struct layouts_transform_test_params {
diff --git a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
index 9a4cb71450a53c..0eb425b4dc1119 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
@@ -192,9 +192,9 @@ TEST(add_required_reorders, skip_adding_reorder_batch_axis_padding) {
     crop_prim = network.get_primitive("crop2");
     ASSERT_EQ(crop_prim->can_be_optimized(), true);
     auto reorder_prim = network.get_primitive("crop1_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     reorder_prim = network.get_primitive("crop2_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     auto concate = network.get_primitive("concat");
     ASSERT_EQ(concate->can_be_optimized(), false);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
index 493ab79bf8e2cb..ee4382e51645cd 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
@@ -318,3 +318,108 @@ TEST(mark_shape_of_subgraphs, gather_compressed_no_mark) {
     ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("gather_compressed")));
     ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("concat")));
 }
+
+TEST(mark_shape_of_subgraphs, broadcast_not_existed_after_shapeof) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto data_0 = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
+    set_values(data_0, {1, 4, 1, 1});
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, {1152, 4, 1, 1} });
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("data_0", data_0));
+    topology.add(data("weights", weights));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(reshape("reshape", input_info("shape_of"), input_info("data_0"), false, {}));
+    topology.add(convolution("convolution", input_info("reshape"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("convolution")));
+}
+
+TEST(mark_shape_of_subgraphs, broadcast_w_data_and_direct_shapeof_no_mark) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto data_0 = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
+    set_values(data_0, {0});
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, {1152, 4, 2, 2} });
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("data_0", data_0));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(broadcast("broadcast", input_info("data_0"), input_info("shape_of"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+    topology.add(data("weights", weights));
+    topology.add(convolution("convolution", input_info("broadcast"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("convolution")));
+    ASSERT_FALSE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
+}
+
+TEST(mark_shape_of_subgraphs, broadcast_w_data_and_indirect_shapeof) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto data_0 = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
+    set_values(data_0, {0});
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("data_0", data_0));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(gather("gather", input_info("shape_of"), input_info("data_0"), 0, 0, {}));
+    topology.add(broadcast("broadcast", input_info("data_0"), input_info("gather"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
+}
+
+TEST(mark_shape_of_subgraphs, broadcast_w_direct_shapeof_and_data) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto target_shape = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
+    set_values(target_shape, {4, 4, 1, 1});
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("target_shape", target_shape));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(broadcast("broadcast", input_info("shape_of"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+    topology.add(reshape("reshape", input_info("input"), input_info("broadcast"), false, ov::PartialShape{4, 4, 1, 1}));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
+}
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 456fab4ae0286a..1eb11c662608e0 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1224,7 +1224,7 @@ TEST(prepare_buffer_fusing, test_implicit_crop_and_outerpadding) {
     auto reorder_prim = network.get_primitive("gather1_reorder");
     ASSERT_EQ(reorder_prim->can_be_optimized(), true);
     reorder_prim = network.get_primitive("gather2_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     auto reshape_prim = network.get_primitive("reshape1");
     ASSERT_EQ(reshape_prim->can_be_optimized(), true);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
index f0243f055c3670..13934020bfdf66 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
@@ -10820,7 +10820,14 @@ TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) {
         return outputs_ref.at("conv").get_memory();
     };
 
-    auto in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(p.in_shape[1]), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16};
+    cldnn::layout in_layout;
+    if (p.in_shape[2] % 2 == 0) {
+        // input feature is static
+        in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(p.in_shape[1]), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16};
+    } else {
+        // input feature is dynamic
+        in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16};
+    }
 
     auto input = engine.allocate_memory({ p.in_shape, data_types::f16, format::b_fs_yx_fsv16 });
     auto weights = engine.allocate_memory({p.wei_shape, data_types::f16, is_grouped ? format::bfzyx : format::bfyx});
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index f59dc5c42cffc1..5bc7e403d3bf74 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -4137,6 +4137,10 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input
     this->test_compressed_int4_scale_dyn_quan(false, true, 511, true);
 }
 
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_batch_1) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 1, 2048, 3072);
+}
+
 TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_case) {
     this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560);
 }
@@ -4827,3 +4831,52 @@ TEST_F(fully_connected_gpu_tests, weights_reorder_shapes_update) {
 TEST_F(fully_connected_gpu_tests, weights_reorder_shapes_update_cached) {
     this->test_weights_reorder_shapes_update(true);
 }
+
+TEST(fully_connected_gpu, cm) {
+    int min_random = -2, max_random = 2;
+    auto& engine = get_test_engine();
+    ExecutionConfig config = get_test_default_config(engine);
+
+    if (!cldnn::check_cm_jit_support(engine, config)) {
+        GTEST_SKIP();
+    }
+
+    // Test parameters
+    const int batch_num = 2;
+    const int output_f = 4;
+    const int input_x = 1;
+    const int input_y = 1;
+    const int input_f = 3;
+
+    // Allocate memory
+    auto input_prim = engine.allocate_memory({ data_types::f16, format::bfyx, { batch_num, input_f, input_y, input_x } });
+    auto weights_prim = engine.allocate_memory({ data_types::f16, format::oiyx, { output_f, input_f, input_y, input_x } });
+    auto bias_prim = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 1, output_f, 1 } });
+
+    // Generate random input data and set values
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto input_data = rg.generate_random_4d<ov::float16>(batch_num, input_f, input_y, input_x, min_random, max_random);
+    auto weights_data = rg.generate_random_4d<ov::float16>(output_f, input_f, input_y, input_x, min_random, max_random);
+    auto bias_data = rg.generate_random_1d<ov::float16>(output_f, min_random, max_random);
+
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    set_values(input_prim, input_data_bfyx);
+    set_values(weights_prim, weights_data_bfyx);
+    set_values(bias_prim, bias_data);
+
+    topology topology(
+        input_layout("input", input_prim->get_layout()),
+        data("weights", weights_prim),
+        data("bias", bias_prim),
+        fully_connected("fc_prim", input_info("input"), "weights", "bias")
+    );
+
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "", impl_types::cm };
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
+
+    network network(engine, topology, config);
+    network.set_input_data("input", input_prim);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "fc_prim");
+
+    // Do not validate output for CM
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/paged_attention_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/paged_attention_gpu_test.cpp
new file mode 100644
index 00000000000000..a32ef3325cd9bc
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/paged_attention_gpu_test.cpp
@@ -0,0 +1,687 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+#include "random_generator.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace cldnn;
+using namespace ov::intel_gpu;
+using namespace ::tests;
+
+/*
+* PagedAttention inputs:
+* [0]: query
+*      shape: [batch_size_in_tokens, num_heads * head_size], type: f16
+* [1]: key
+*      shape: [batch_size_in_tokens, num_kv_heads * head_size], type: f16
+* [2]: value
+*      shape: [batch_size_in_tokens, num_kv_heads * head_size], type: f16
+* [3]: key_cache
+*      shape: [num_blocks, num_kv_heads, head_size, block_size], type: f16
+* [4]: value_cache
+*      shape: [num_blocks, num_kv_heads, block_size, head_size], type: f16
+* [5]: past_lens
+*      shape: [batch_size_in_sequences], type: i32
+* [6]: subsequence_begins
+*      shape: [batch_size_in_sequences + 1], type: i32
+* [7]: block_indices
+*      shape: [num_blocks], type: i32
+* [8]: block_indices_begins
+*      shape: [batch_size_in_sequences + 1], type: i32
+* [9]: scale, optional
+* [10]: sliding_window, optional
+* [11]: alibi_slopes, optional
+* [12]: max_context_len
+*      shape: [], type: i32
+*/
+
+struct SubsequenceDescriptor {
+    int num_tokens;
+    int past_len;
+};
+
+struct PagedAttentionManager {
+    int num_heads;
+    int head_size;
+    int block_size;
+    std::vector<SubsequenceDescriptor> subsequence_descs;
+
+    // per-subsequence QKV inputs
+    std::vector<std::vector<ov::float16>> query_data; // {[1, num_tokens, num_heads, head_size], ..}
+    std::vector<std::vector<ov::float16>> key_data;   // {[1, past_len + num_tokens, num_heads, head_size], ..}
+    std::vector<std::vector<ov::float16>> value_data; // {[1, past_len + num_tokens, num_heads, head_size], ..}
+
+    // common PA inputs
+    std::vector<int> past_lens;
+    std::vector<int> subsequence_begins;
+    std::vector<int> block_indices;
+    std::vector<int> block_indices_begins;
+    std::vector<int> max_context_len;
+
+    cldnn::engine& test_engine;
+    cldnn::stream& test_stream;
+    tests::random_generator& rg;
+
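+    // Worked example (editorial illustration, not part of the original test):
+    // for two subsequences {num_tokens = 10, past_len = 0} and {num_tokens = 4, past_len = 12}
+    // with block_size = 16, the constructor below fills:
+    //   past_lens            = {0, 12}
+    //   subsequence_begins   = {0, 10, 14}
+    //   block_indices        = {0, 1}
+    //   block_indices_begins = {0, 1, 2}
+    //   max_context_len      = {16}
+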
PagedAttentionManager(tests::random_generator& rg, + cldnn::engine& engine, + cldnn::stream& stream, + const std::vector& subsequence_descs, + int num_heads, + int head_size, + int block_size) + : num_heads(num_heads) + , head_size(head_size) + , block_size(block_size) + , subsequence_descs(subsequence_descs) + , test_engine(engine) + , test_stream(stream) + , rg(rg) { + // init subsequence_begins and block_indices_begins + subsequence_begins.push_back(0); + block_indices_begins.push_back(0); + + int max_len = 0; + for (int i = 0; i < static_cast(subsequence_descs.size()); i++) { + const auto& subsequence_desc = subsequence_descs[i]; + max_len = std::max(max_len, subsequence_desc.num_tokens + subsequence_desc.past_len); + + query_data.push_back(generate_input_data(rg, num_heads, subsequence_desc.num_tokens, head_size)); + key_data.push_back(generate_input_data(rg, num_heads, subsequence_desc.num_tokens + subsequence_desc.past_len, head_size)); + value_data.push_back(generate_input_data(rg, num_heads, subsequence_desc.num_tokens + subsequence_desc.past_len, head_size)); + + past_lens.push_back(subsequence_desc.past_len); + int subsequence_start_pos = subsequence_begins[i]; + int subsequence_end_pos = subsequence_start_pos + subsequence_desc.num_tokens; + subsequence_begins.push_back(subsequence_end_pos); + + int subsequence_length = subsequence_desc.num_tokens + subsequence_desc.past_len; + int required_blocks = ceil_div(subsequence_length, block_size); + int start_block_idx = block_indices.empty() ? 0 : block_indices.back() + 1; + int end_block_idx = start_block_idx + required_blocks; + for (int block_idx = start_block_idx; block_idx < end_block_idx; block_idx++) { + block_indices.push_back(block_idx); + } + + int block_indices_start_pos = block_indices_begins[i]; + int block_indices_end_pos = block_indices_start_pos + required_blocks; + block_indices_begins.push_back(block_indices_end_pos); + } + max_context_len.push_back(max_len); + } + + memory::ptr get_query_memory() { + return get_QKV_memory(query_data, false); + } + + memory::ptr get_key_memory() { + return get_QKV_memory(key_data, true); + } + + memory::ptr get_value_memory() { + return get_QKV_memory(value_data, true); + } + + memory::ptr get_key_cache_memory() { + auto num_blocks = block_indices.back() + 1; + auto key_cache_shape = ov::PartialShape{ num_blocks, num_heads, head_size, block_size }; + auto key_cache_layout = layout{ key_cache_shape, data_types::f16, format::bfyx }; + auto memory = test_engine.allocate_memory(key_cache_layout); + + for (int i = 0; i < static_cast(subsequence_descs.size()); i++) { + int past_len = subsequence_descs[i].past_len; + if (past_len != 0) { + int blocks_num = ceil_div(past_len, block_size); + int start_block_idx = block_indices[block_indices_begins[i]]; + for (int block_idx = 0; block_idx < blocks_num; block_idx++) { + int last_token_idx = block_idx == blocks_num - 1 ? 
past_len % block_size + : block_size; + for (int token_idx = 0; token_idx < last_token_idx; token_idx++) { + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + for (int head_size_idx = 0; head_size_idx < head_size; head_size_idx++) { + size_t input_token_offset = block_idx * block_size + token_idx; + ov::float16* data_ptr = key_data[i].data() + + input_token_offset * num_heads * head_size + + head_idx * head_size + head_size_idx; + + // shape: [num_blocks, num_heads, head_size, block_size] + size_t output_offset = (start_block_idx + block_idx) * num_heads * head_size * block_size + + head_idx * head_size * block_size + + head_size_idx * block_size + + token_idx; + + set_values(test_stream, memory, data_ptr, 1, output_offset); + } + } + } + } + } + } + + return memory; + } + + memory::ptr get_value_cache_memory() { + auto num_blocks = block_indices.back() + 1; + auto value_cache_shape = ov::PartialShape{ num_blocks, num_heads, block_size, head_size }; + auto value_cache_layout = layout{ value_cache_shape, data_types::f16, format::bfyx }; + auto memory = test_engine.allocate_memory(value_cache_layout); + + for (int i = 0; i < static_cast(subsequence_descs.size()); i++) { + int past_len = subsequence_descs[i].past_len; + if (past_len != 0) { + int blocks_num = ceil_div(past_len, block_size); + int start_block_idx = block_indices[block_indices_begins[i]]; + for (int block_idx = 0; block_idx < blocks_num; block_idx++) { + int last_token_idx = block_idx == blocks_num - 1 ? past_len % block_size + : block_size; + for (int token_idx = 0; token_idx < last_token_idx; token_idx++) { + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + size_t input_token_offset = block_idx * block_size + token_idx; + ov::float16* data_ptr = value_data[i].data() + + input_token_offset * num_heads * head_size + + head_idx * head_size; + + // shape: [num_blocks, num_heads, block_size, head_size] + size_t output_offset = (start_block_idx + block_idx) * num_heads * block_size * head_size + + head_idx * block_size * head_size + + token_idx * head_size; + + set_values(test_stream, memory, data_ptr, head_size, output_offset); + } + } + } + } + } + + return memory; + } + + memory::ptr get_past_lens_memory() { + return get_memory_from_vec(past_lens); + } + + memory::ptr get_subsequence_begins_memory() { + return get_memory_from_vec(subsequence_begins); + } + + memory::ptr get_block_indices_memory() { + return get_memory_from_vec(block_indices); + } + + memory::ptr get_block_indices_begins_memory() { + return get_memory_from_vec(block_indices_begins); + } + + memory::ptr get_scale_memory() { + std::vector scale = { ov::float16(get_default_scale()) }; + return get_memory_from_vec(scale); + } + + memory::ptr get_sliding_window_memory() { + std::vector sliding_window = { 0 }; + return get_memory_from_vec(sliding_window); + } + + memory::ptr get_alibi_memory() { + std::vector alibi; + return get_memory_from_vec(alibi); + } + + memory::ptr get_max_context_len_memory() { + return get_memory_from_vec(max_context_len); + } + + float get_default_scale() { + return static_cast(1.f / std::sqrt(head_size)); + } + +private: + template + memory::ptr get_memory_from_vec(std::vector& input_data) { + auto data_size = input_data.empty() ? 
1 : input_data.size(); + auto shape = ov::PartialShape{ static_cast(data_size) }; + auto layout = cldnn::layout{ shape, ov::element::from(), format::bfyx }; + auto memory = test_engine.allocate_memory(layout); + + if (input_data.empty()) { + auto shape = ov::PartialShape{0}; + auto layout = cldnn::layout{ shape, ov::element::from(), format::bfyx }; + return test_engine.reinterpret_buffer(*memory, layout); + } + + set_values(test_stream, memory, input_data.data(), input_data.size(), 0); + + return memory; + } + + memory::ptr get_QKV_memory(std::vector>& input_data, bool skip_past_len) { + int total_tokens = 0; + for (const auto& subsequence_desc : subsequence_descs) + total_tokens += subsequence_desc.num_tokens; + + auto query_shape = ov::PartialShape{ total_tokens, num_heads * head_size }; + auto query_layout = layout{ query_shape, data_types::f16, format::bfyx }; + auto memory = test_engine.allocate_memory(query_layout); + + for (int subsequence_idx = 0; subsequence_idx < static_cast(subsequence_descs.size()); subsequence_idx++) { + for (int token_idx = 0; token_idx < subsequence_descs[subsequence_idx].num_tokens; token_idx++) { + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + size_t input_token_offset = token_idx; + // as generated data stored in vectors includes past_len, ignore it for KV inputs + if (skip_past_len) + input_token_offset += subsequence_descs[subsequence_idx].past_len; + + ov::float16* data_ptr = input_data[subsequence_idx].data() + + input_token_offset * num_heads * head_size + + head_idx * head_size; + + size_t output_token_offset = subsequence_begins[subsequence_idx] + token_idx; + size_t output_offset = output_token_offset * num_heads * head_size + + head_idx * head_size; + + set_values(test_stream, memory, data_ptr, head_size, output_offset); + } + } + } + + return memory; + } + + template + static void set_values(stream& stream, memory::ptr mem, T* vals, size_t size, size_t dst_offset) { + mem_lock mem_ptr(mem, stream); + for (size_t i = 0; i < size; i++) { + mem_ptr[dst_offset + i] = vals[i]; + } + } + + static std::vector generate_input_data(tests::random_generator& rg, size_t num_heads, size_t tokens_num, size_t head_size) { + const size_t total_elements_num = tokens_num * num_heads * head_size; + auto data = rg.generate_random_1d(total_elements_num, -1, 1); + + return data; + } +}; + +struct PagedAttentionReference { + PagedAttentionReference(PagedAttentionManager& pam) + : pam(pam) + , test_engine(pam.test_engine) + , test_stream(pam.test_stream) {} + + std::pair, std::vector> get_reference() { + std::vector ref_data_output; + std::vector ref_scores_output; + + for (size_t i = 0; i < pam.subsequence_descs.size(); i++) { + const auto& subsequence_desc = pam.subsequence_descs[i]; + const auto kv_seq_len = subsequence_desc.num_tokens + subsequence_desc.past_len; + auto subsequence_ref_results = run_reference(pam.query_data[i], + pam.key_data[i], + pam.value_data[i], + subsequence_desc.num_tokens, + kv_seq_len, + pam.num_heads, + pam.head_size, + pam.get_default_scale()); + + // concatenate all subsequences into one vector + ref_data_output.insert(ref_data_output.end(), + subsequence_ref_results.first.begin(), + subsequence_ref_results.first.end()); + ref_scores_output.insert(ref_scores_output.end(), + subsequence_ref_results.second.begin(), + subsequence_ref_results.second.end()); + } + + return { ref_data_output, ref_scores_output }; + } + +private: + std::pair, std::vector> + run_reference(const std::vector& query_data, + const std::vector& 
key_data, + const std::vector& value_data, + int num_queries, + int num_keys, + int num_heads, + int head_size, + float scale) { + auto query_shape = ov::PartialShape{1, num_queries, num_heads, head_size}; + auto key_shape = ov::PartialShape{1, num_keys, num_heads, head_size}; + auto value_shape = ov::PartialShape{1, num_keys, num_heads, head_size}; + + auto query_layout = layout{query_shape, data_types::f16, format::bfyx}; + auto key_layout = layout{key_shape, data_types::f16, format::bfyx}; + auto value_layout = layout{value_shape, data_types::f16, format::bfyx}; + + OPENVINO_ASSERT(query_layout.count() == query_data.size()); + OPENVINO_ASSERT(key_layout.count() == key_data.size()); + OPENVINO_ASSERT(value_layout.count() == value_data.size()); + + auto query_mem = test_engine.allocate_memory(query_layout); + auto key_mem = test_engine.allocate_memory(key_layout); + auto value_mem = test_engine.allocate_memory(value_layout); + auto mask_mem = get_mask_mem(num_queries, num_keys, num_heads); + + set_values(query_mem, query_data); + set_values(key_mem, key_data); + set_values(value_mem, value_data); + + topology topology; + topology.add(input_layout("query", query_layout), + input_layout("key", key_layout), + input_layout("value", value_layout), + data("mask", mask_mem), + permute("query_transposed", input_info("query"), {0, 2, 1, 3}), + permute("key_transposed", input_info("key"), {0, 2, 1, 3}), + permute("value_transposed", input_info("value"), {0, 2, 1, 3}), + gemm("qk_gemm", { input_info("query_transposed"), input_info("key_transposed") }, data_types::f16, false, true, scale), + eltwise("eltwise", { input_info("qk_gemm"), input_info("mask") }, eltwise_mode::sum), + softmax("softmax", input_info("eltwise"), -1), + gemm("qkv_gemm", { input_info("softmax"), input_info("value_transposed") }, data_types::f16, false, false), + permute("qkv_gemm_transposed", input_info("qkv_gemm"), {0, 2, 1, 3}), + reorder("output_data", input_info("qkv_gemm_transposed"), format::bfyx, data_types::f16), + reorder("scores_data", input_info("softmax"), format::bfyx, data_types::f16) + ); + + ExecutionConfig config = get_test_default_config(test_engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + network::ptr network = get_network(test_engine, topology, config, get_test_stream_ptr(), false); + network->set_input_data("query", query_mem); + network->set_input_data("key", key_mem); + network->set_input_data("value", value_mem); + + auto outputs = network->execute(); + + auto output_data_mem = outputs.at("output_data").get_memory(); + auto output_scores_mem = outputs.at("scores_data").get_memory(); + + return { get_output_data_vec(output_data_mem, num_queries, head_size, num_heads), + get_output_scores_vec(output_scores_mem, num_queries, num_keys, num_heads) }; + } + + std::vector get_output_scores_vec(memory::ptr scores_output, + int num_queries, + int num_keys, + int num_heads) { + OPENVINO_ASSERT(scores_output->count() == static_cast(num_heads * num_queries * num_keys)); + + std::vector output_scores(num_keys, 0); + mem_lock mem_ptr(scores_output, test_stream); + for (int head_idx = 0; head_idx < num_heads; head_idx++) { + for (int score_idx = 0; score_idx < num_keys; score_idx++) { + output_scores[score_idx] += mem_ptr[head_idx * num_queries * num_keys + + (num_queries - 1) * num_keys + + score_idx]; + } + } + + return output_scores; + } + + std::vector get_output_data_vec(memory::ptr data_output, + int num_queries, + int 
+
+    std::vector<ov::float16> get_output_scores_vec(memory::ptr scores_output,
+                                                   int num_queries,
+                                                   int num_keys,
+                                                   int num_heads) {
+        OPENVINO_ASSERT(scores_output->count() == static_cast<size_t>(num_heads * num_queries * num_keys));
+
+        std::vector<ov::float16> output_scores(num_keys, 0);
+        mem_lock<ov::float16> mem_ptr(scores_output, test_stream);
+        for (int head_idx = 0; head_idx < num_heads; head_idx++) {
+            for (int score_idx = 0; score_idx < num_keys; score_idx++) {
+                output_scores[score_idx] += mem_ptr[head_idx * num_queries * num_keys +
+                                                    (num_queries - 1) * num_keys +
+                                                    score_idx];
+            }
+        }
+
+        return output_scores;
+    }
+
+    std::vector<ov::float16> get_output_data_vec(memory::ptr data_output,
+                                                 int num_queries,
+                                                 int head_size,
+                                                 int num_heads) {
+        OPENVINO_ASSERT(data_output->count() == static_cast<size_t>(num_queries * num_heads * head_size));
+
+        std::vector<ov::float16> output_data(data_output->count());
+        mem_lock<ov::float16> mem_ptr(data_output, test_stream);
+        for (size_t i = 0; i < data_output->count(); i++)
+            output_data[i] = mem_ptr[i];
+
+        return output_data;
+    }
+
+    memory::ptr get_mask_mem(int num_queries, int num_keys, int num_heads) {
+        /*
+         * Two kinds of masks:
+         *
+         * Case 1 (N == K):
+         * num_queries = N
+         * num_keys = K = N
+         * head_size = H
+         * Q [N, H] * K[H, N]
+         * QK [N, N]
+         *      0    1    ..   N
+         * 0 [  0,  MIN,  ..,  MIN ]
+         * 1 [  0,  0,    ..,  MIN ]
+         *   [  .., ..,   ..,  MIN ]
+         * N [  0,  0,    ..,  0   ]
+         *
+         * Case 2 (N != K):
+         * num_queries = N
+         * num_keys = K
+         * head_size = H
+         * past_len = P = K - N + 1
+         * Q [N, H] * K[H, K]
+         * QK [N, K]
+         *      0   1   2   P    ..   K
+         * 0 [  0,  0,  0,  MIN, MIN, MIN ]
+         * 1 [  0,  0,  0,  0,   MIN, MIN ]
+         *   [  .., .., .., ..,  ..,  MIN ]
+         * N [  0,  0,  0,  0,   ..,  0   ]
+         *
+         * Shapes:
+         * Q   [1, num_heads, num_queries, head_size]
+         * K   [1, num_heads, head_size, num_keys]
+         * Q*K [1, num_heads, num_queries, num_keys]
+         */
+
+        auto mask_shape = ov::PartialShape{ 1, 1, num_queries, num_keys };
+        auto mask_layout = layout{mask_shape, data_types::f16, format::bfyx};
+        auto mask_mem = test_engine.allocate_memory(mask_layout);
+
+        int past_len = num_keys - num_queries + 1;
+        mem_lock<ov::float16> mem_ptr(mask_mem, test_stream);
+        for (int i = 0; i < num_queries; i++) {
+            for (int j = 0; j < num_keys; j++) {
+                mem_ptr[i * num_keys + j] = j >= past_len + i ? std::numeric_limits<ov::float16>::lowest()
+                                                              : ov::float16(0.f);
+            }
+        }
+
+        return mask_mem;
+    }
+
+
+    PagedAttentionManager& pam;
+    cldnn::engine& test_engine;
+    cldnn::stream& test_stream;
+};
+
+template <typename T>
+struct PagedAttentionTest : public ::testing::TestWithParam<T> {
+public:
+    random_generator rg;
+    cldnn::engine& engine = get_test_engine();
+    float tolerance = 2e-3;
+
+    void SetUp() override {
+        rg.set_seed(GET_SUITE_NAME);
+    }
+
+    void execute(T& p) {
+        PagedAttentionManager pam(rg, get_test_engine(), get_test_stream(), p.subsequences, p.num_heads, p.head_size, p.block_size);
+
+        auto query_mem = pam.get_query_memory();
+        auto key_mem = pam.get_key_memory();
+        auto value_mem = pam.get_value_memory();
+
+        auto key_cache_mem = pam.get_key_cache_memory();
+        auto value_cache_mem = pam.get_value_cache_memory();
+
+        auto past_lens_mem = pam.get_past_lens_memory();
+        auto subsequence_begins_mem = pam.get_subsequence_begins_memory();
+        auto block_indices_mem = pam.get_block_indices_memory();
+        auto block_indices_begins_mem = pam.get_block_indices_begins_memory();
+
+        auto scale_mem = pam.get_scale_memory();
+        auto sliding_window_mem = pam.get_sliding_window_memory();
+        auto alibi_mem = pam.get_alibi_memory();
+        auto max_context_len_mem = pam.get_max_context_len_memory();
+
+        auto query_layout = query_mem->get_layout();
+        auto key_layout = key_mem->get_layout();
+        auto value_layout = value_mem->get_layout();
+        auto key_cache_layout = key_cache_mem->get_layout();
+        auto value_cache_layout = value_cache_mem->get_layout();
+        auto past_lens_layout = past_lens_mem->get_layout();
+        auto subsequence_begins_layout = subsequence_begins_mem->get_layout();
+        auto block_indices_layout = block_indices_mem->get_layout();
+        auto block_indices_begins_layout = block_indices_begins_mem->get_layout();
+        auto scale_layout = scale_mem->get_layout();
+        auto sliding_window_layout = sliding_window_mem->get_layout();
+        auto alibi_layout = alibi_mem->get_layout();
+        auto max_context_len_layout = max_context_len_mem->get_layout();
+
+        // make layouts dynamic
+        query_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads * p.head_size });
+        key_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads * p.head_size });
+        value_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads * p.head_size });
+        key_cache_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads, p.head_size, p.block_size });
+        value_cache_layout.set_partial_shape(ov::PartialShape{ -1, p.num_heads, p.block_size, p.head_size });
+        past_lens_layout.set_partial_shape(ov::PartialShape{ -1 });
+        subsequence_begins_layout.set_partial_shape(ov::PartialShape{ -1 });
+        block_indices_layout.set_partial_shape(ov::PartialShape{ -1 });
+        block_indices_begins_layout.set_partial_shape(ov::PartialShape{ -1 });
+
+        auto pa_prim = paged_attention("paged_attention", { input_info("query"),
+                                                            input_info("key"),
+                                                            input_info("value"),
+                                                            input_info("key_cache"),
+                                                            input_info("value_cache"),
+                                                            input_info("past_lens"),
+                                                            input_info("subsequence_begins"),
+                                                            input_info("block_indices"),
+                                                            input_info("block_indices_begins"),
+                                                            input_info("scale"),
+                                                            input_info("sliding_window"),
+                                                            input_info("alibi"),
+                                                            input_info("max_context_len") });
+
+        pa_prim.head_size = p.head_size;
+        pa_prim.kv_heads_num = p.num_heads;
+        pa_prim.heads_num = p.num_heads;
+        pa_prim.scale_val = pam.get_default_scale();
+        pa_prim.has_alibi = false;
+        pa_prim.num_outputs = p.scores_output ? 2 : 1;
+
+        topology topology;
+        topology.add(
+            input_layout("query", query_layout),
+            input_layout("key", key_layout),
+            input_layout("value", value_layout),
+            input_layout("key_cache", key_cache_layout),
+            input_layout("value_cache", value_cache_layout),
+            input_layout("past_lens", past_lens_layout),
+            input_layout("subsequence_begins", subsequence_begins_layout),
+            input_layout("block_indices", block_indices_layout),
+            input_layout("block_indices_begins", block_indices_begins_layout),
+            input_layout("scale", scale_layout),
+            input_layout("sliding_window", sliding_window_layout),
+            input_layout("alibi", alibi_layout),
+            input_layout("max_context_len", max_context_len_layout),
+            pa_prim,
+            reorder("output_data", input_info("paged_attention", 0), format::bfyx, data_types::f16)
+        );
+
+        if (p.scores_output) {
+            topology.add(reorder("output_scores", input_info("paged_attention", 1), format::bfyx, data_types::f16));
+        }
+
+        ExecutionConfig config = get_test_default_config(get_test_engine());
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+        network::ptr network = get_network(get_test_engine(), topology, config, get_test_stream_ptr(), false);
+        network->set_input_data("query", query_mem);
+        network->set_input_data("key", key_mem);
+        network->set_input_data("value", value_mem);
+        network->set_input_data("key_cache", key_cache_mem);
+        network->set_input_data("value_cache", value_cache_mem);
+        network->set_input_data("past_lens", past_lens_mem);
+        network->set_input_data("subsequence_begins", subsequence_begins_mem);
+        network->set_input_data("block_indices", block_indices_mem);
+        network->set_input_data("block_indices_begins", block_indices_begins_mem);
+        network->set_input_data("scale", scale_mem);
+        network->set_input_data("sliding_window", sliding_window_mem);
+        network->set_input_data("alibi", alibi_mem);
+        network->set_input_data("max_context_len", max_context_len_mem);
+
+        auto outputs = network->execute();
+
+        cldnn::memory::ptr output_data_mem = nullptr;
+        cldnn::memory::ptr output_scores_mem = nullptr;
+
+        output_data_mem = outputs.at("output_data").get_memory();
+        if (p.scores_output) {
+            output_scores_mem = outputs.at("output_scores").get_memory();
+        }
+
+        auto ref_data = PagedAttentionReference(pam).get_reference();
+        compare(output_data_mem, output_scores_mem, ref_data);
+    }
+
+    void compare(memory::ptr data_output_mem, memory::ptr scores_output_mem, std::pair<std::vector<ov::float16>, std::vector<ov::float16>> ref_data) {
+        if (data_output_mem) {
+            ASSERT_EQ(data_output_mem->count(), ref_data.first.size());
+            mem_lock<ov::float16> mem_ptr(data_output_mem, get_test_stream());
+            for (size_t i = 0; i < data_output_mem->count(); i++) {
+                ASSERT_NEAR(mem_ptr[i], ref_data.first[i], tolerance);
+            }
+        }
+
+        if (scores_output_mem) {
+            ASSERT_EQ(scores_output_mem->count(), ref_data.second.size());
+            mem_lock<ov::float16> mem_ptr(scores_output_mem, get_test_stream());
+            for (size_t i = 0; i < scores_output_mem->count(); i++) {
+                ASSERT_NEAR(mem_ptr[i], ref_data.second[i], tolerance);
+            }
+        }
+    }
+};
+
+struct paged_attention_test_params {
+    std::vector<SubsequenceDescriptor> subsequences;
+    int num_heads;
+    int head_size;
+    int block_size;
+    bool scores_output;
+};
+
+class paged_attention_test : public PagedAttentionTest<paged_attention_test_params> {};
+TEST_P(paged_attention_test, basic) {
+    auto p = GetParam();
+
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke_paged_attention, paged_attention_test, ::testing::ValuesIn(std::vector<paged_attention_test_params>{
+    /* with scores output */
+    paged_attention_test_params{ {{10, 0}}, 2, 64, 16, true },                    // 1st token
+    paged_attention_test_params{ {{36, 0}}, 2, 64, 16, true },                    // 1st token
+    paged_attention_test_params{ {{1024, 0}}, 2, 64, 16, true },                  // 1st token long
+    paged_attention_test_params{ {{10, 0}, {30, 0}}, 2, 64, 16, true },           // 1st token + 1st token
+    paged_attention_test_params{ {{128, 0}, {256, 0}}, 2, 64, 16, true },         // 1st token + 1st token
+    paged_attention_test_params{ {{1, 10}}, 2, 64, 16, true },                    // 2nd token
+    paged_attention_test_params{ {{1, 34}, {1, 515}}, 2, 64, 16, true },          // 2nd token + 2nd token
+    paged_attention_test_params{ {{1, 34}, {25, 0}, {10, 34}}, 2, 64, 16, true }, // mixed: 2nd token + 1st token + part of 1st token
+    /* without scores output */
+    paged_attention_test_params{ {{10, 0}}, 2, 64, 16, false },                   // 1st token
+    paged_attention_test_params{ {{1024, 0}}, 2, 64, 16, false },                 // 1st token long
+    paged_attention_test_params{ {{1, 34}, {1, 515}}, 2, 64, 16, false },         // 2nd token + 2nd token
+}));
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
index 8ade3b6c8e0f31..0f9f119f275a78 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
@@ -2467,6 +2467,99 @@ TEST(reorder_gpu_f32, bfzyx_to_bsv16_fsv16_padded)
     }
 }
 
+TEST(reorder_gpu_f32, bfzyx_to_bfyx_padded) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+
+    const int32_t b_in = 1024;
+    const int32_t f_in = 64;
+    const int32_t x_in = 72;
+    const int32_t y_in = 2;
+    const int32_t z_in = 3;
+
+    const int32_t b_crop = 1024;
+    const int32_t f_crop = 64;
+    const int32_t x_crop = 72;
+    const int32_t y_crop = 2;
+    const int32_t z_crop = 1;
+
+    const int32_t z0_off = 0;
+    const int32_t z1_off = 1;
+    const int32_t z2_off = 2;
+
+    auto input = engine.allocate_memory({ data_types::f32,format::bfzyx,{ b_in, f_in, x_in, y_in, z_in } });
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(crop("crop0", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop },
{ 0, 0, 0, 0, z0_off }));
+    topology.add(crop("crop1", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z1_off }));
+    topology.add(crop("crop2", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z2_off }));
+    topology.add(reorder("reorder0", input_info("crop0"), format::bfyx, data_types::f32));
+    topology.add(reorder("reorder1", input_info("crop1"), format::bfyx, data_types::f32));
+    topology.add(reorder("reorder2", input_info("crop2"), format::bfyx, data_types::f32));
+    topology.add(reshape("reshape0", input_info("reorder0"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+    topology.add(reshape("reshape1", input_info("reorder1"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+    topology.add(reshape("reshape2", input_info("reorder2"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+
+    std::vector<float> input_vec = rg.generate_random_1d<float>(input->count(), -10, 10);
+    set_values(input, input_vec);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+    auto output0 = outputs.at("reshape0").get_memory();
+    auto output1 = outputs.at("reshape1").get_memory();
+    auto output2 = outputs.at("reshape2").get_memory();
+
+    cldnn::mem_lock<float> output_ptr0(output0, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z0_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr0[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+
+    cldnn::mem_lock<float> output_ptr1(output1, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z1_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr1[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+
+    cldnn::mem_lock<float> output_ptr2(output2, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z2_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr2[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+}
+
 TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed)
 {
     auto& engine = get_test_engine();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 260a1c444284cb..eb13bc8b5bd1d9 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -5,14 +5,208 @@
 #include "llm_infer_request.hpp"
 #include "logging.hpp"
+#include "openvino/op/ops.hpp"
+#include "openvino/openvino.hpp"
+#include "openvino/opsets/opset13.hpp"
+#include "openvino/pass/graph_rewrite.hpp"
+#include "openvino/pass/matcher_pass.hpp"
"openvino/pass/matcher_pass.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/pass/stateful_to_stateless.hpp" +#include "openvino/pass/validate.hpp" #include "openvino/runtime/iasync_infer_request.hpp" +namespace opp = ov::pass::pattern; +class TransposeValueTensors : public ov::pass::MatcherPass { +public: + struct Context { + std::vector> new_params; + std::vector> old_params; + using Ref = std::reference_wrapper; + }; + + OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::TransposeValueTensors"); + TransposeValueTensors(Context::Ref ctx) { + auto param = opp::wrap_type(); + auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto concat = opp::wrap_type({param, transpose}); + auto softmax = opp::wrap_type({opp::any_input()}); + auto matmul = opp::wrap_type({softmax, concat}); + + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); + auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); + auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); + auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); + + auto matched_param = std::static_pointer_cast(matched_node_param); + auto matched_concat = std::static_pointer_cast(matched_node_concat); + auto matched_transpose = std::static_pointer_cast(matched_node_transpose); + auto matched_matmul = std::static_pointer_cast(matched_node_matmul); + + auto shape = matched_param->get_partial_shape(); + OPENVINO_ASSERT(shape.size() == 4u); + // NB: Transpose Parameter that correspond to V-tensor it will + // speed-up its multiplication with attention scores + std::swap(shape[2], shape[3]); + auto new_param = std::make_shared(matched_param->get_element_type(), shape); + new_param->set_friendly_name(matched_param->get_friendly_name()); + new_param->outputs().begin()->get_tensor().set_names( + matched_param->outputs().begin()->get_tensor().get_names()); + ov::replace_node(matched_param, new_param); + // NB: Save in order to add/remove to the model later on + ctx.get().new_params.push_back(new_param); + ctx.get().old_params.push_back(matched_param); + + auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); + auto new_transpose = + std::make_shared(matched_transpose->input_value(0), order_cst->output(0)); + new_transpose->set_friendly_name(matched_transpose->get_friendly_name()); + ov::replace_node(matched_transpose, new_transpose); + + auto new_concat = + std::make_shared(ov::OutputVector{new_param->output(0), new_transpose->output(0)}, + 3u); + new_concat->set_friendly_name(matched_concat->get_friendly_name()); + ov::replace_node(matched_concat, new_concat); + + matched_matmul->set_transpose_b(true); + + return true; + }; + register_matcher(std::make_shared(matmul, "TransposeValueTensors"), std::move(callback)); + } +}; + +class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::ScaledDotProductAttentionDecomposition"); + ScaledDotProductAttentionDecomposition() { + auto pattern_node = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto node = ov::as_type_ptr( + pattern_to_output.at(pattern_node).get_node_shared_ptr()); + + if (node == nullptr || 
+
+class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::ScaledDotProductAttentionDecomposition");
+    ScaledDotProductAttentionDecomposition() {
+        auto pattern_node = ov::pass::pattern::wrap_type<ov::op::v13::ScaledDotProductAttention>();
+
+        ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
+            auto& pattern_to_output = m.get_pattern_value_map();
+            auto node = ov::as_type_ptr<ov::op::v13::ScaledDotProductAttention>(
+                pattern_to_output.at(pattern_node).get_node_shared_ptr());
+
+            if (node == nullptr || transformation_callback(node)) {
+                return false;
+            }
+
+            auto new_output_node = decompose(node);
+            ov::replace_node(node, new_output_node);
+            return true;
+        };
+
+        auto m = std::make_shared<ov::pass::pattern::Matcher>(pattern_node, "ScaledDotProductAttentionDecomposition");
+        register_matcher(m, std::move(callback));
+    }
+    std::shared_ptr<ov::Node> decompose(std::shared_ptr<ov::op::v13::ScaledDotProductAttention> node) {
+        using namespace ov::op;
+        using namespace ov;
+        auto query = node->input_value(0);
+        auto key = node->input_value(1);
+        auto value = node->input_value(2);
+        auto q_shape = register_new_node<v3::ShapeOf>(query, element::i32);
+        auto k_shape = register_new_node<v3::ShapeOf>(key, element::i32);
+        auto minus_one = register_new_node(v0::Constant::create(element::i32, Shape{}, {-1}));
+        auto minus_two = register_new_node(v0::Constant::create(element::i32, Shape{}, {-2}));
+        auto zero_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {0}));
+        auto one_i = register_new_node(v0::Constant::create(element::i32, Shape{}, {1}));
+        auto one_f = register_new_node<v1::ConvertLike>(one_i, query);
+        auto zero_f = register_new_node<v1::ConvertLike>(zero_i, query);
+
+        Output<Node> scale;
+        if (node->get_input_size() < 5) {
+            scale = register_new_node<v8::Gather>(q_shape, minus_one, zero_i)->output(0);
+            scale = register_new_node<v1::ConvertLike>(scale, query);
+            auto sqrt_scale = register_new_node<v0::Sqrt>(scale);
+            scale = register_new_node<v1::Divide>(one_f, sqrt_scale);
+        } else {
+            scale = node->input_value(4);
+        }
+
+        auto q_scaled = register_new_node<v1::Multiply>(query, scale);
+        auto k_rank = register_new_node<v3::ShapeOf>(k_shape, element::i32)->output(0);
+        auto k_last_dim = register_new_node<v1::Add>(k_rank, minus_one);
+        auto k_next_dim = register_new_node<v1::Add>(k_rank, minus_two)->output(0);
+        k_rank = register_new_node<v0::Squeeze>(k_rank, zero_i);
+        auto minus_inf =
+            register_new_node(v0::Constant::create(element::f32, Shape{}, {-std::numeric_limits<float>::infinity()}))
+                ->output(0);
+        auto keep_dim_last = register_new_node<v0::Unsqueeze>(k_next_dim, zero_i);
+        auto k_dims_before_transpose = register_new_node<v4::Range>(zero_i, keep_dim_last, one_i, element::i32);
+
+        auto scaled_atten = register_new_node<v0::MatMul>(q_scaled, key, false, true)->output(0);
+        minus_inf = register_new_node<v1::ConvertLike>(minus_inf, scaled_atten);
+
+        if (node->get_causal() || node->get_input_size() > 3) {
+            Output<Node> mask;
+            Output<Node> atten_mask;
+            if (!node->get_causal()) {
+                mask = node->input_value(3);
+
+                // Two types of masks are supported: a boolean mask, where a value of True
+                // indicates that the element should take part in attention, and a float mask
+                // of the same type as query/key/value, which is added to the attention score.
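+                // E.g. a boolean mask [[true, false]] admits key 0 and blocks key 1,
+                // while the equivalent additive float mask would be [[0.0, -inf]].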
+                if (mask.get_element_type() == element::boolean) {
+                    atten_mask = register_new_node<v1::ConvertLike>(mask, scaled_atten);
+                    auto inv_mask = register_new_node<v1::LogicalNot>(mask);
+                    atten_mask = register_new_node<v1::Select>(inv_mask, atten_mask, minus_inf);
+                } else {
+                    atten_mask = mask;
+                }
+            } else {
+                auto target_s_len = register_new_node<v8::Gather>(q_shape, minus_two, zero_i);
+                auto source_s_len = register_new_node<v8::Gather>(k_shape, minus_two, zero_i);
+                auto ssl = register_new_node<v0::Unsqueeze>(source_s_len, zero_i);
+                auto tsl = register_new_node<v0::Unsqueeze>(target_s_len, zero_i);
+                auto mask_shape = register_new_node<v0::Concat>(OutputVector{tsl, ssl}, 0);
+                mask = register_new_node<v3::Broadcast>(minus_inf, mask_shape);
+                auto horizontal_range =
+                    register_new_node<v4::Range>(zero_i, source_s_len, one_i, element::i32)->output(0);
+                horizontal_range = register_new_node<v0::Unsqueeze>(horizontal_range, zero_i);
+                auto stop = register_new_node<v1::Add>(target_s_len, one_i);
+                auto vertical_range = register_new_node<v4::Range>(one_i, stop, one_i, element::i32)->output(0);
+                vertical_range = register_new_node<v0::Unsqueeze>(vertical_range, one_i);
+                auto triu = register_new_node<v1::GreaterEqual>(horizontal_range, vertical_range);
+                atten_mask = register_new_node<v1::Select>(triu, mask, zero_f);
+            }
+            scaled_atten = register_new_node<v1::Add>(scaled_atten, atten_mask);
+        }
+
+        scaled_atten = register_new_node<v8::Softmax>(scaled_atten, -1);
+        auto result = register_new_node<v0::MatMul>(scaled_atten, value);
+        result->set_friendly_name(node->get_friendly_name());
+        copy_runtime_info(node, get_new_nodes());
+        return result;
+    }
+};
+
 namespace {
 uint32_t align_to(uint32_t value, uint32_t alignment) {
     return (value + alignment - 1) & ~(alignment - 1);
 }
 
+std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
+    ov::preprocess::PrePostProcessor ppp(model);
+
+    for (const auto& tensor : model->inputs()) {
+        if (tensor.get_any_name().find("past_key") != std::string::npos) {
+            ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        }
+    }
+
+    for (const auto& tensor : model->outputs()) {
+        if (tensor.get_any_name().find("present") != std::string::npos) {
+            ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        }
+    }
+
+    return ppp.build();
+}
+
 std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) {
     const auto kStartOutputKVCacheLayers = 1u;
     for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) {
@@ -27,22 +221,33 @@ std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr
 
-std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
+std::shared_ptr<ov::Model> cvt_value_tensors_layout(std::shared_ptr<ov::Model> model) {
     ov::preprocess::PrePostProcessor ppp(model);
-
-    for (const auto& tensor : model->inputs()) {
-        if (tensor.get_any_name().find("past_key") != std::string::npos) {
-            ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+    for (auto tensor : model->outputs()) {
+        if (tensor.get_any_name().find("value") != std::string::npos) {
+            // NB: [batch, num_heads, seq_len, emb_size] -> [batch, num_heads, emb_size, seq_len]
+            ppp.output(tensor.get_any_name()).model().set_layout(ov::Layout("BHSE"));
+            ppp.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("BHES"));
         }
     }
+    return ppp.build();
+}
 
-    for (const auto& tensor : model->outputs()) {
-        if (tensor.get_any_name().find("present") != std::string::npos) {
-            ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
-        }
+bool optimize_value_tensors(std::shared_ptr<ov::Model> model) {
+    ov::pass::GraphRewrite rewr;
+    rewr.add_matcher<ScaledDotProductAttentionDecomposition>();
+    TransposeValueTensors::Context ctx;
+    rewr.add_matcher<TransposeValueTensors>(std::ref(ctx));
+    rewr.run_on_model(model);
+
+    model->add_parameters(ctx.new_params);
+    for (auto old_param : ctx.old_params) {
+        model->remove_parameter(old_param);
     }
+    ov::pass::Validate().run_on_model(model);
 
-    return ppp.build();
+    // NB: if new_params is not empty, the pass has been applied
+    return !ctx.new_params.empty();
 }
 
 struct KVAxesPosition {
@@ -116,32 +321,6 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr
     return std::make_optional(NPUDesc{arch.as<std::string>(), max_tiles.as<int64_t>()});
 }
 
-std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
-    if (auto it = config.find(option_name); it != config.end()) {
-        std::optional<ov::Any> found = std::make_optional(it->second);
-        config.erase(it);
-        return found;
-    }
-    return std::nullopt;
-}
-
-template <typename T>
-std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name) {
-    if (auto it = config.find(option_name); it != config.end()) {
-        return std::make_optional(it->second.as<T>());
-    }
-    return std::nullopt;
-}
-
-template <typename T>
-T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
-    auto anyopt = pop_option(config, key);
-    if (anyopt.has_value()) {
-        return anyopt.value().as<T>();
-    }
-    return default_value;
-}
-
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
         {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -206,12 +385,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
     }
 }
 
-void drop_cache_dir(ov::AnyMap& config) {
-    if (config.count("NPU_USE_NPUW") != 0u) {
-        pop_option(config, "CACHE_DIR");
-    }
-}
-
 void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
     for (auto it = properties.begin(); it != properties.end(); ++it) {
         if (it->first.find("NPUW_LLM") != it->first.npos) {
@@ -251,41 +424,48 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     auto kvcache_model = model->clone();
     LOG_DEBUG("2. Transform kvcache model from stateful to stateless.");
     ov::pass::StatefulToStateless().run_on_model(kvcache_model);
-
     LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one.");
     auto prefill_model = kvcache_model->clone();
     prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");
-    LOG_DEBUG("4. Converting KV-cache in prefill model to FP16.");
-    prefill_model = cvt_kvcache_to_fp16(prefill_model);
-
-    LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token.");
-    kvcache_model = redirect_new_kv_to_output(kvcache_model);
-    LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16.");
-    kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
+    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
     const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
     const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
-    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
     KVAxesPosition axes = get_kv_axes(model_desc.type);
     m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
-    LOG_DEBUG("7. Make prefill model with static shapes");
+    LOG_DEBUG("4. Make prefill model with static shapes");
     reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
-    LOG_DEBUG("8. Make kvcache model with static shapes");
+    LOG_DEBUG("5. Make kvcache model with static shapes");
     reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
+    LOG_DEBUG("6. Check and apply opt layout if applicable.");
+    // NB: Try to apply the opt transpose only for the Llama-2-7b-chat-hf model
+    if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
+        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
+        if (optimize_value_tensors(kvcache_model)) {
+            // NB: Check if the TransposeValueTensors transformation was applied
+            m_kvcache_desc.v_tensors_transposed = true;
+            prefill_model = cvt_value_tensors_layout(prefill_model);
+        }
+    }
+    LOG_DEBUG("7. Optimize kvcache model to output key/values for new token.");
+    kvcache_model = redirect_new_kv_to_output(kvcache_model);
+    LOG_DEBUG("8. Converting KV-cache in kvcache model to FP16.");
+    kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
+    LOG_DEBUG("9. Converting KV-cache in prefill model to FP16.");
+    prefill_model = cvt_kvcache_to_fp16(prefill_model);
     auto npudesc = extract_npu_descriptor(plugin);
-
-    ov::AnyMap properties_copy = std::move(other_props);
+    ov::AnyMap properties_copy = other_props;
     auto prefill_config = get_default_prefill_config(model, npudesc);
+
     // NB: GENERATE_HINT is only applicable for default generate config!
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
-    LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
+    LOG_DEBUG(
+        "10. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
     auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
+
     merge_config_with(prefill_config, properties_copy);
     merge_config_with(generate_config, properties_copy);
-    // FIXME: Drop CACHE_DIR option if NPUW is enabled
-    drop_cache_dir(prefill_config);
-    drop_cache_dir(generate_config);
 
     m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
     m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
index 1a748997fd48fa..e37a47b2c77948 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
@@ -22,6 +22,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
         uint32_t total_size = 0u;
         uint32_t num_stored_tokens = 0u;
         uint32_t dim = 0u;
+        bool v_tensors_transposed = false;
     };
 
     LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index a73478c0cab5d2..12f103cc0ab6a2 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -27,6 +27,36 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
     end_shape[dim] = end_pos;
     return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape));
 }
+
+void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst) {
+    const auto src_shape = src->get_shape();
+
+    OPENVINO_ASSERT(src_shape.size() == 4u);
+    OPENVINO_ASSERT(src_shape == dst->get_shape());
+    OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size());
+
+    const auto src_strides = src->get_strides();
+    const auto dst_strides = dst->get_strides();
+    const auto elem_size = src->get_byte_size() / src->get_size();
+
+    const auto C = src_shape[1];
+    const auto H = src_shape[2];
+    const auto W = src_shape[3];
+
+    const auto IS_H = src_strides[2];
+    const auto OS_H = dst_strides[2];
+
+    const size_t chunk_byte_size = W * elem_size;
+
+    const auto* src_p = static_cast<const uint8_t*>(src->data());
+    auto* dst_p = static_cast<uint8_t*>(dst->data());
+
+    for (size_t i = 0; i < C * H; ++i) {
+        const size_t src_offset = i * IS_H;
+        const size_t dst_offset = i * OS_H;
+        std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset);
+    }
+}
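+
+// NB (illustration): for a transposed V-cache the destination slice is not
+// contiguous, so the copy walks all C * H rows and moves one W-element chunk per
+// row using each tensor's own stride; e.g. for shape [1, 8, 64, 1024] this issues
+// 8 * 64 = 512 chunk copies of 1024 elements each.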
 }  // anonymous namespace
 
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
@@ -116,17 +146,25 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         // taking into account kvcache dimension.
         fill_tensor<ov::float16>(kvcache_in_tensor, 0);
 
+        const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
+                                 ? 3u
+                                 : m_kvcache_desc.dim;
+
         auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
-                                                   m_kvcache_desc.dim,
+                                                   kv_dim,
                                                    m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
                                                    m_kvcache_desc.max_prompt_size);
 
-        auto kvcache_in_slice =
-            make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
+        auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens);
 
-        prefill_out_slice->copy_to(kvcache_in_slice._ptr);
+        if (kv_dim == 3u) {
+            copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
+        } else {
+            prefill_out_slice->copy_to(kvcache_in_slice._ptr);
+        }
     }
 
+    LOG_DEBUG("Prepare attention mask pattern.");
     auto* attention_mask_data = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
@@ -156,8 +194,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
+        const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
+                                 ? 3u
+                                 : m_kvcache_desc.dim;
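+        // NB (illustration): when TransposeValueTensors put the value cache into
+        // BHES layout, its sequence axis is 3, so "value" tensors are sliced and
+        // appended along dim 3 instead of m_kvcache_desc.dim.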
         auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
-                                                  m_kvcache_desc.dim,
+                                                  kv_dim,
                                                   m_kvcache_desc.num_stored_tokens - 1,
                                                   m_kvcache_desc.num_stored_tokens);
         auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index 5abe4b39fd44f2..0260fc9718c444 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -160,7 +160,8 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
         auto qcoeff_shape = matched_node_qcoeff->output(0).get_shape();
 
         if ((ov::element::i4 == matched_qweight->get_element_type() ||
-             ov::element::i8 == matched_qweight->get_element_type()) &&
+             ov::element::i8 == matched_qweight->get_element_type() ||
+             ov::element::nf4 == matched_qweight->get_element_type()) &&
            (ov::op::util::is_parameter(matched_node_qcoeff) || ov::op::util::is_constant(matched_node_qcoeff)) &&
             qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
             auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 08b4308479ef03..de3ad80280d603 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,5 +7,5 @@ add_subdirectory(model_hub_tests)
 add_subdirectory(samples_tests)
 add_subdirectory(e2e_tests)
 
-install(FILES requirements_pytorch requirements_tensorflow requirements_onnx
+install(FILES requirements_pytorch requirements_tensorflow requirements_onnx requirements_jax
         DESTINATION tests COMPONENT tests EXCLUDE_FROM_ALL)
diff --git a/tests/constraints.txt b/tests/constraints.txt
index 4f46cd0cc8b2e9..c339ac3c65d56f 100644
--- a/tests/constraints.txt
+++ b/tests/constraints.txt
@@ -21,11 +21,8 @@ pytest>=5.0,<8.4
 pytest-dependency==0.5.1
 pytest-html==4.1.1
 pytest-timeout==2.3.1
-jax<=0.4.36
-jaxlib<=0.4.36
 kornia==0.7.0
 networkx<=3.3
-flax<=0.10.2
 
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch~=2.5.1; platform_system != "Darwin" or platform_machine != "x86_64"
diff --git a/tests/e2e_tests/requirements.txt b/tests/e2e_tests/requirements.txt
index 29e1c1cf31c558..a2056071e5417e 100644
--- a/tests/e2e_tests/requirements.txt
+++ b/tests/e2e_tests/requirements.txt
@@ -9,7 +9,7 @@ scipy>=1.5.4,<1.15
 opencv-python>=4.5; sys_platform != "darwin"
 opencv-python==4.8.1.78; sys_platform == "darwin"
 unittest-xml-reporting==3.0.4
-lpips==0.1.3
+lpips==0.1.4
 
 # for utils/e2e/comparator note: python 3.6 wheels is not available since 0.18
 # Add upper-bound due CVS-105039, CVS-105040
diff --git a/tests/layer_tests/onnx_tests/test_abs.py b/tests/layer_tests/onnx_tests/test_abs.py
index 9a82929ea35547..71e509faef3e65 100644
--- a/tests/layer_tests/onnx_tests/test_abs.py
+++ b/tests/layer_tests/onnx_tests/test_abs.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136")
+
 from common.layer_test_class import check_ir_version
 from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model
 
diff --git a/tests/layer_tests/onnx_tests/test_and.py b/tests/layer_tests/onnx_tests/test_and.py
index ca5d21a42fe067..195ace1dadfa14 100644
--- a/tests/layer_tests/onnx_tests/test_and.py
+++ b/tests/layer_tests/onnx_tests/test_and.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pytest
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_argmax.py b/tests/layer_tests/onnx_tests/test_argmax.py index 604df5e7e69875..80d7568e9e8c4c 100644 --- a/tests/layer_tests/onnx_tests/test_argmax.py +++ b/tests/layer_tests/onnx_tests/test_argmax.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_ceil.py b/tests/layer_tests/onnx_tests/test_ceil.py index b7558630ac1c63..ea7ea10abbd31d 100644 --- a/tests/layer_tests/onnx_tests/test_ceil.py +++ b/tests/layer_tests/onnx_tests/test_ceil.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_clip.py b/tests/layer_tests/onnx_tests/test_clip.py index dbce45193034d9..3cb3ba250a12e0 100644 --- a/tests/layer_tests/onnx_tests/test_clip.py +++ b/tests/layer_tests/onnx_tests/test_clip.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_concat.py b/tests/layer_tests/onnx_tests/test_concat.py index 8627f3b198dbd3..602b6a69644527 100644 --- a/tests/layer_tests/onnx_tests/test_concat.py +++ b/tests/layer_tests/onnx_tests/test_concat.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_conv.py b/tests/layer_tests/onnx_tests/test_conv.py index b7f9729141c33e..202d6af2915c67 100644 --- a/tests/layer_tests/onnx_tests/test_conv.py +++ b/tests/layer_tests/onnx_tests/test_conv.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_cumsum.py b/tests/layer_tests/onnx_tests/test_cumsum.py index 1e197de490d518..486b1f50835fb0 100644 --- a/tests/layer_tests/onnx_tests/test_cumsum.py +++ b/tests/layer_tests/onnx_tests/test_cumsum.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_dequantize_linear.py b/tests/layer_tests/onnx_tests/test_dequantize_linear.py index 9090f3a829919b..319030590a3f0d 100644 --- a/tests/layer_tests/onnx_tests/test_dequantize_linear.py +++ b/tests/layer_tests/onnx_tests/test_dequantize_linear.py @@ -3,6 +3,8 @@ import numpy as np 
import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_elu.py b/tests/layer_tests/onnx_tests/test_elu.py index dbffc32d09c6c7..9f0321ec9a6ee3 100644 --- a/tests/layer_tests/onnx_tests/test_elu.py +++ b/tests/layer_tests/onnx_tests/test_elu.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_embedding_bag.py b/tests/layer_tests/onnx_tests/test_embedding_bag.py index a18a59b9752f16..54d940c01fb36c 100644 --- a/tests/layer_tests/onnx_tests/test_embedding_bag.py +++ b/tests/layer_tests/onnx_tests/test_embedding_bag.py @@ -5,6 +5,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + import torch import torch.nn as nn from common.layer_test_class import CommonLayerTest, check_ir_version diff --git a/tests/layer_tests/onnx_tests/test_floor.py b/tests/layer_tests/onnx_tests/test_floor.py index 87ad058c510e8c..5076befc414941 100644 --- a/tests/layer_tests/onnx_tests/test_floor.py +++ b/tests/layer_tests/onnx_tests/test_floor.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_gather.py b/tests/layer_tests/onnx_tests/test_gather.py index a45d5b4f4a916b..9380de31c6dccc 100644 --- a/tests/layer_tests/onnx_tests/test_gather.py +++ b/tests/layer_tests/onnx_tests/test_gather.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_hard_sigmoid.py b/tests/layer_tests/onnx_tests/test_hard_sigmoid.py index 12986c590d41d4..a62ab2a7fc54e8 100644 --- a/tests/layer_tests/onnx_tests/test_hard_sigmoid.py +++ b/tests/layer_tests/onnx_tests/test_hard_sigmoid.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_identity.py b/tests/layer_tests/onnx_tests/test_identity.py index a86c0e2a687257..e58e272de49ec0 100644 --- a/tests/layer_tests/onnx_tests/test_identity.py +++ b/tests/layer_tests/onnx_tests/test_identity.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_leaky_relu.py b/tests/layer_tests/onnx_tests/test_leaky_relu.py index 3a12bfcd92c33e..cff9cd87b59d30 100644 --- a/tests/layer_tests/onnx_tests/test_leaky_relu.py +++ b/tests/layer_tests/onnx_tests/test_leaky_relu.py @@ -2,6 +2,8 @@ # 
SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_log.py b/tests/layer_tests/onnx_tests/test_log.py index db0a329aa09746..53e2c42505bf7b 100644 --- a/tests/layer_tests/onnx_tests/test_log.py +++ b/tests/layer_tests/onnx_tests/test_log.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_logsoftmax.py b/tests/layer_tests/onnx_tests/test_logsoftmax.py index a81b20402d50dd..057376d6ed48b2 100644 --- a/tests/layer_tests/onnx_tests/test_logsoftmax.py +++ b/tests/layer_tests/onnx_tests/test_logsoftmax.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_lrn.py b/tests/layer_tests/onnx_tests/test_lrn.py index 0e8f34129a300f..1c1cf62d5d12b4 100644 --- a/tests/layer_tests/onnx_tests/test_lrn.py +++ b/tests/layer_tests/onnx_tests/test_lrn.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_neg.py b/tests/layer_tests/onnx_tests/test_neg.py index d19991cb8a6b12..98f6acd728f637 100644 --- a/tests/layer_tests/onnx_tests/test_neg.py +++ b/tests/layer_tests/onnx_tests/test_neg.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_non_zero.py b/tests/layer_tests/onnx_tests/test_non_zero.py index 464304651a2a19..a2035b4ab27d63 100644 --- a/tests/layer_tests/onnx_tests/test_non_zero.py +++ b/tests/layer_tests/onnx_tests/test_non_zero.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_not.py b/tests/layer_tests/onnx_tests/test_not.py index 05a6c7ffbb2e2d..1caf8e2e7a770c 100644 --- a/tests/layer_tests/onnx_tests/test_not.py +++ b/tests/layer_tests/onnx_tests/test_not.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_or.py b/tests/layer_tests/onnx_tests/test_or.py index 285c90765d6a7e..6db35aff2f500e 100644 --- a/tests/layer_tests/onnx_tests/test_or.py +++ b/tests/layer_tests/onnx_tests/test_or.py @@ -3,6 +3,8 @@ import numpy as np import pytest 
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_pad.py b/tests/layer_tests/onnx_tests/test_pad.py index abacc530d93144..161db0685b6fa8 100644 --- a/tests/layer_tests/onnx_tests/test_pad.py +++ b/tests/layer_tests/onnx_tests/test_pad.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_pooling.py b/tests/layer_tests/onnx_tests/test_pooling.py index 85e7fc883fc5d8..2bc2251f8aea49 100644 --- a/tests/layer_tests/onnx_tests/test_pooling.py +++ b/tests/layer_tests/onnx_tests/test_pooling.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_prelu.py b/tests/layer_tests/onnx_tests/test_prelu.py index f20e89b7006a44..59a1e8f4f415e1 100644 --- a/tests/layer_tests/onnx_tests/test_prelu.py +++ b/tests/layer_tests/onnx_tests/test_prelu.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_reduce.py b/tests/layer_tests/onnx_tests/test_reduce.py index 58141e18260016..46b4008c4e653d 100644 --- a/tests/layer_tests/onnx_tests/test_reduce.py +++ b/tests/layer_tests/onnx_tests/test_reduce.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_reduce_lp.py b/tests/layer_tests/onnx_tests/test_reduce_lp.py index 2ff4511ef87443..3cf2f5e133b895 100644 --- a/tests/layer_tests/onnx_tests/test_reduce_lp.py +++ b/tests/layer_tests/onnx_tests/test_reduce_lp.py @@ -5,6 +5,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_relu.py b/tests/layer_tests/onnx_tests/test_relu.py index ce597920923289..520749ed948b25 100644 --- a/tests/layer_tests/onnx_tests/test_relu.py +++ b/tests/layer_tests/onnx_tests/test_relu.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_reshape.py b/tests/layer_tests/onnx_tests/test_reshape.py index 637beeb4388bbb..28eb339af52f9e 100644 --- a/tests/layer_tests/onnx_tests/test_reshape.py +++ b/tests/layer_tests/onnx_tests/test_reshape.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest 
+pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_resize.py b/tests/layer_tests/onnx_tests/test_resize.py index 4d28afdb50fe38..36a808fa859ef1 100644 --- a/tests/layer_tests/onnx_tests/test_resize.py +++ b/tests/layer_tests/onnx_tests/test_resize.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_roi_align.py b/tests/layer_tests/onnx_tests/test_roi_align.py index 4cd49c50c20bf8..d5cedf4e1a0f06 100644 --- a/tests/layer_tests/onnx_tests/test_roi_align.py +++ b/tests/layer_tests/onnx_tests/test_roi_align.py @@ -5,6 +5,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model from unit_tests.utils.graph import build_graph diff --git a/tests/layer_tests/onnx_tests/test_scatter.py b/tests/layer_tests/onnx_tests/test_scatter.py index 578300e144bc3d..baaa0392553fbf 100644 --- a/tests/layer_tests/onnx_tests/test_scatter.py +++ b/tests/layer_tests/onnx_tests/test_scatter.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_sigmoid.py b/tests/layer_tests/onnx_tests/test_sigmoid.py index 5dcb3e8f1b112a..db055a6d9030ac 100644 --- a/tests/layer_tests/onnx_tests/test_sigmoid.py +++ b/tests/layer_tests/onnx_tests/test_sigmoid.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_sign.py b/tests/layer_tests/onnx_tests/test_sign.py index 07f4f169a7bc1b..70c0ffcc0033ec 100644 --- a/tests/layer_tests/onnx_tests/test_sign.py +++ b/tests/layer_tests/onnx_tests/test_sign.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_softmax.py b/tests/layer_tests/onnx_tests/test_softmax.py index c4d9d600276402..390b1a894549c3 100644 --- a/tests/layer_tests/onnx_tests/test_softmax.py +++ b/tests/layer_tests/onnx_tests/test_softmax.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_softplus.py b/tests/layer_tests/onnx_tests/test_softplus.py index cdcbbbf3e8ed13..b0127c0dcf0624 100644 --- a/tests/layer_tests/onnx_tests/test_softplus.py +++ b/tests/layer_tests/onnx_tests/test_softplus.py @@ -2,6 +2,8 @@ # 
SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_softsign.py b/tests/layer_tests/onnx_tests/test_softsign.py index 30ca27402c7878..75043b57b80dc7 100644 --- a/tests/layer_tests/onnx_tests/test_softsign.py +++ b/tests/layer_tests/onnx_tests/test_softsign.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_sqrt.py b/tests/layer_tests/onnx_tests/test_sqrt.py index 9c4733a68cd9fa..24dbbcac659df4 100644 --- a/tests/layer_tests/onnx_tests/test_sqrt.py +++ b/tests/layer_tests/onnx_tests/test_sqrt.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_trigonometry.py b/tests/layer_tests/onnx_tests/test_trigonometry.py index 563b63b1e5632d..99651091ea2e96 100644 --- a/tests/layer_tests/onnx_tests/test_trigonometry.py +++ b/tests/layer_tests/onnx_tests/test_trigonometry.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_where.py b/tests/layer_tests/onnx_tests/test_where.py index fb358a2ced8415..1bf845340b3922 100644 --- a/tests/layer_tests/onnx_tests/test_where.py +++ b/tests/layer_tests/onnx_tests/test_where.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/onnx_tests/test_xor.py b/tests/layer_tests/onnx_tests/test_xor.py index 2790a31784ff59..e7f0c11f8362a2 100644 --- a/tests/layer_tests/onnx_tests/test_xor.py +++ b/tests/layer_tests/onnx_tests/test_xor.py @@ -3,6 +3,8 @@ import numpy as np import pytest +pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136") + from common.layer_test_class import check_ir_version from common.onnx_layer_test_class import OnnxRuntimeLayerTest, onnx_make_model diff --git a/tests/layer_tests/requirements.txt b/tests/layer_tests/requirements.txt index 04889ebce10a39..2ba12cc5e2bece 100644 --- a/tests/layer_tests/requirements.txt +++ b/tests/layer_tests/requirements.txt @@ -16,5 +16,3 @@ pytest defusedxml tensorflow tensorflow-addons; python_version <= '3.10' -jax; sys_platform == "linux" and platform_machine == "x86_64" # https://jax.readthedocs.io/en/latest/installation.html#pip-installation-cpu - wheels are for "x86_64" only -jaxlib; sys_platform == "linux" and platform_machine == "x86_64" # https://jax.readthedocs.io/en/latest/installation.html#pip-installation-cpu - wheels are for "x86_64" only diff --git a/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py 
b/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py index 4ff4d589cbae32..5c1037e38cfc84 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_UnaryOpsAllRealDomain.py @@ -67,4 +67,4 @@ def test_unary_ops(self, input_shape, input_type, op_type, pytest.skip("159585: accuracy error on ARM") self._test(*self.create_unary_net(input_shape, input_type, op_type), ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend, custom_eps=1e-3) + use_legacy_frontend=use_legacy_frontend, custom_eps=3 * 1e-3) diff --git a/tests/model_hub_tests/jax/requirements.txt b/tests/model_hub_tests/jax/requirements.txt deleted file mode 100644 index 328084ac050ca6..00000000000000 --- a/tests/model_hub_tests/jax/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ --c ../../constraints.txt -numpy -pytest -pytest-html -transformers -requests -jax -jaxlib -flax -pillow \ No newline at end of file diff --git a/tests/requirements_jax b/tests/requirements_jax new file mode 100644 index 00000000000000..c392df4359bee3 --- /dev/null +++ b/tests/requirements_jax @@ -0,0 +1,13 @@ +numpy==1.26.4; python_version < "3.12" or platform_system == "Darwin" and platform_machine == "x86_64" +numpy==2.2.1; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") +pytest==7.0.1 +pytest-xdist[psutil]==3.6.1 +pytest-html==4.1.1 +jax==0.4.38; (platform_system != "Darwin" or platform_machine != "x86_64") and python_version > "3.9" +# tensorflow 2.16.2 depends on ml-dtypes~=0.3.1 and jax 0.4.35 depends on ml-dtypes>=0.4.0 +jax==0.4.33; (platform_system == "Darwin" and platform_machine == "x86_64") and python_version > "3.9" +jax==0.4.30; python_version <= "3.9" +flax==0.10.2 +transformers==4.47.1 +defusedxml +pillow diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index f42deb81839883..33907145f7de4b 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -14,7 +14,8 @@ torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 transformers==4.44.2 -pytest==7.0.1 +pytest==7.0.1; python_version < '3.10' +pytest==7.2.0; python_version >= '3.10' pytest-html==4.1.1 pytest-xdist[psutil]==3.6.1 defusedxml==0.7.1 diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 5369b0135f7618..5d699facad1c91 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -4,7 +4,8 @@ # tensorflow 2.16.2 depends on numpy<2.0.0 and >=1.26.0; python_version >= "3.12" numpy==1.26.4; python_version < "3.12" or platform_system == "Darwin" and platform_machine == "x86_64" numpy==2.0.2; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") -pytest==7.0.1 +pytest==7.0.1; python_version < '3.10' +pytest==7.2.0; python_version >= '3.10' pytest-xdist[psutil]==3.6.1 pytest-html==4.1.1 transformers==4.45.1 @@ -17,7 +18,7 @@ wrapt==1.15.0; python_version >= "3.12" # tensorflow-text is not available for both Windows and ARM platforms tensorflow-text==2.18.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1 -jax==0.4.35; (platform_system != "Darwin" or platform_machine != "x86_64") and python_version > "3.9" +jax==0.4.38; (platform_system != "Darwin" or platform_machine != "x86_64") and python_version > "3.9" # tensorflow 2.16.2 depends on 
ml-dtypes~=0.3.1 and jax 0.4.35 depends on ml-dtypes>=0.4.0 jax==0.4.33; (platform_system == "Darwin" and platform_machine == "x86_64") and python_version > "3.9" jax==0.4.30; python_version <= "3.9" diff --git a/tools/benchmark_tool/openvino/__init__.py b/tools/benchmark_tool/openvino/__init__.py index 7643f742e0067d..69c678909b1c9e 100644 --- a/tools/benchmark_tool/openvino/__init__.py +++ b/tools/benchmark_tool/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass @@ -17,47 +17,6 @@ # # This __init__.py forces checking of runtime modules to propagate errors. # # It is not compared with init files from openvino-dev package. # # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue - # Import all public modules from openvino import runtime as runtime from openvino import frontend as frontend @@ -67,10 +26,36 @@ from openvino import utils as utils from openvino import properties as properties +# Import most important classes and functions from openvino.runtime +from openvino._ov_api import Model +from openvino._ov_api import Core +from openvino._ov_api import CompiledModel +from openvino._ov_api import InferRequest +from openvino._ov_api import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + # Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file +from openvino.runtime.utils.data_helpers import tensor_from_file from openvino._ov_api import compile_model +from 
openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import save_model +from openvino.runtime import layout_helpers +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor +from openvino._pyopenvino import Op # Import opsets from openvino import opset1 @@ -95,7 +80,7 @@ from openvino._pyopenvino import VASurfaceTensor # Set version for openvino package -from openvino._pyopenvino import get_version +from openvino.runtime import get_version __version__ = get_version() # Tools diff --git a/tools/mo/openvino/__init__.py b/tools/mo/openvino/__init__.py index 7643f742e0067d..b015570964c520 100644 --- a/tools/mo/openvino/__init__.py +++ b/tools/mo/openvino/__init__.py @@ -7,96 +7,61 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass -# # -# # OpenVINO API -# # This __init__.py forces checking of runtime modules to propagate errors. -# # It is not compared with init files from openvino-dev package. -# # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue +# OpenVINO API +try: + # Import all public modules + from openvino import runtime as runtime + from openvino import frontend as frontend + from openvino import helpers as helpers + from openvino import preprocess as preprocess + from openvino import utils as utils + from openvino import properties as properties -# Import all public modules -from openvino import runtime as runtime -from openvino import frontend as frontend -from openvino import helpers as helpers -from openvino import experimental as experimental -from openvino import preprocess as preprocess -from openvino import utils as utils -from openvino import properties as 
properties + # Import most important classes and functions from openvino.runtime + from openvino.runtime import Model + from openvino.runtime import Core + from openvino.runtime import CompiledModel + from openvino.runtime import InferRequest + from openvino.runtime import AsyncInferQueue -# Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file -from openvino._ov_api import compile_model + from openvino.runtime import Symbol + from openvino.runtime import Dimension + from openvino.runtime import Strides + from openvino.runtime import PartialShape + from openvino.runtime import Shape + from openvino.runtime import Layout + from openvino.runtime import Type + from openvino.runtime import Tensor + from openvino.runtime import OVAny + from openvino.runtime import compile_model + from openvino.runtime import get_batch + from openvino.runtime import set_batch + from openvino.runtime import serialize + from openvino.runtime import shutdown + from openvino.runtime import tensor_from_file + from openvino.runtime import save_model + from openvino.runtime import layout_helpers -# Import opsets -from openvino import opset1 -from openvino import opset2 -from openvino import opset3 -from openvino import opset4 -from openvino import opset5 -from openvino import opset6 -from openvino import opset7 -from openvino import opset8 -from openvino import opset9 -from openvino import opset10 -from openvino import opset11 -from openvino import opset12 -from openvino import opset13 -from openvino import opset14 -from openvino import opset15 -from openvino import opset16 + from openvino._pyopenvino import RemoteContext + from openvino._pyopenvino import RemoteTensor + from openvino._pyopenvino import Op -# libva related: -from openvino._pyopenvino import VAContext -from openvino._pyopenvino import VASurfaceTensor + # libva related: + from openvino._pyopenvino import VAContext + from openvino._pyopenvino import VASurfaceTensor -# Set version for openvino package -from openvino._pyopenvino import get_version -__version__ = get_version() + # Set version for openvino package + from openvino.runtime import get_version + __version__ = get_version() +except ImportError: + import warnings + warnings.warn("openvino package has problems with imports!", ImportWarning, stacklevel=2) # Tools try: diff --git a/tools/openvino_dev/src/openvino/__init__.py b/tools/openvino_dev/src/openvino/__init__.py index 7643f742e0067d..b015570964c520 100644 --- a/tools/openvino_dev/src/openvino/__init__.py +++ b/tools/openvino_dev/src/openvino/__init__.py @@ -7,96 +7,61 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass -# # -# # OpenVINO API -# # This __init__.py forces checking of runtime modules to propagate errors. -# # It is not compared with init files from openvino-dev package. 
-# # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue +# OpenVINO API +try: + # Import all public modules + from openvino import runtime as runtime + from openvino import frontend as frontend + from openvino import helpers as helpers + from openvino import preprocess as preprocess + from openvino import utils as utils + from openvino import properties as properties -# Import all public modules -from openvino import runtime as runtime -from openvino import frontend as frontend -from openvino import helpers as helpers -from openvino import experimental as experimental -from openvino import preprocess as preprocess -from openvino import utils as utils -from openvino import properties as properties + # Import most important classes and functions from openvino.runtime + from openvino.runtime import Model + from openvino.runtime import Core + from openvino.runtime import CompiledModel + from openvino.runtime import InferRequest + from openvino.runtime import AsyncInferQueue -# Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file -from openvino._ov_api import compile_model + from openvino.runtime import Symbol + from openvino.runtime import Dimension + from openvino.runtime import Strides + from openvino.runtime import PartialShape + from openvino.runtime import Shape + from openvino.runtime import Layout + from openvino.runtime import Type + from openvino.runtime import Tensor + from openvino.runtime import OVAny + from openvino.runtime import compile_model + from openvino.runtime import get_batch + from openvino.runtime import set_batch + from openvino.runtime import serialize + from openvino.runtime import shutdown + from openvino.runtime import tensor_from_file + from openvino.runtime import save_model + from openvino.runtime import layout_helpers -# Import opsets -from openvino import opset1 -from openvino import opset2 -from openvino import opset3 -from openvino import opset4 -from openvino import opset5 
-from openvino import opset6 -from openvino import opset7 -from openvino import opset8 -from openvino import opset9 -from openvino import opset10 -from openvino import opset11 -from openvino import opset12 -from openvino import opset13 -from openvino import opset14 -from openvino import opset15 -from openvino import opset16 + from openvino._pyopenvino import RemoteContext + from openvino._pyopenvino import RemoteTensor + from openvino._pyopenvino import Op -# libva related: -from openvino._pyopenvino import VAContext -from openvino._pyopenvino import VASurfaceTensor + # libva related: + from openvino._pyopenvino import VAContext + from openvino._pyopenvino import VASurfaceTensor -# Set version for openvino package -from openvino._pyopenvino import get_version -__version__ = get_version() + # Set version for openvino package + from openvino.runtime import get_version + __version__ = get_version() +except ImportError: + import warnings + warnings.warn("openvino package has problems with imports!", ImportWarning, stacklevel=2) # Tools try: diff --git a/tools/ovc/openvino/__init__.py b/tools/ovc/openvino/__init__.py index 7643f742e0067d..69c678909b1c9e 100644 --- a/tools/ovc/openvino/__init__.py +++ b/tools/ovc/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.package_utils import _add_openvino_libs_to_search_path + from openvino.utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass @@ -17,47 +17,6 @@ # # This __init__.py forces checking of runtime modules to propagate errors. # # It is not compared with init files from openvino-dev package. # # - -# Openvino pybind bindings -from openvino._pyopenvino import AxisSet -from openvino._pyopenvino import AxisVector -from openvino._pyopenvino import ConstOutput -from openvino._pyopenvino import Coordinate -from openvino._pyopenvino import CoordinateDiff -from openvino._pyopenvino import DiscreteTypeInfo -from openvino._pyopenvino import Extension -from openvino._pyopenvino import ProfilingInfo -from openvino._pyopenvino import RTMap -from openvino._pyopenvino import Version -from openvino._pyopenvino import Symbol -from openvino._pyopenvino import Dimension -from openvino._pyopenvino import Input -from openvino._pyopenvino import Output -from openvino._pyopenvino import Node -from openvino._pyopenvino import Strides -from openvino._pyopenvino import PartialShape -from openvino._pyopenvino import Shape -from openvino._pyopenvino import Layout -from openvino._pyopenvino import Type -from openvino._pyopenvino import Tensor -from openvino._pyopenvino import OVAny -from openvino._pyopenvino import get_batch -from openvino._pyopenvino import set_batch -from openvino._pyopenvino import serialize -from openvino._pyopenvino import shutdown -from openvino._pyopenvino import save_model -from openvino._pyopenvino import layout_helpers -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor -from openvino._pyopenvino import Op - -# Import public classes from _ov_api -from openvino._ov_api import Model -from openvino._ov_api import Core -from openvino._ov_api import CompiledModel -from openvino._ov_api import InferRequest -from openvino._ov_api import AsyncInferQueue - # Import all public modules from openvino import runtime as runtime from openvino import frontend as frontend @@ -67,10 +26,36 @@ from openvino import utils as utils from openvino import properties as properties +# Import most important 
classes and functions from openvino.runtime +from openvino._ov_api import Model +from openvino._ov_api import Core +from openvino._ov_api import CompiledModel +from openvino._ov_api import InferRequest +from openvino._ov_api import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + # Helper functions for openvino module -from openvino.utils.data_helpers import tensor_from_file +from openvino.runtime.utils.data_helpers import tensor_from_file from openvino._ov_api import compile_model +from openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import save_model +from openvino.runtime import layout_helpers +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor +from openvino._pyopenvino import Op # Import opsets from openvino import opset1 @@ -95,7 +80,7 @@ from openvino._pyopenvino import VASurfaceTensor # Set version for openvino package -from openvino._pyopenvino import get_version +from openvino.runtime import get_version __version__ = get_version() # Tools
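

The pytest.importorskip guard added across the ONNX layer tests above skips an entire test module at collection time when the legacy Model Optimizer (openvino.tools.mo) is not importable, rather than failing inside the common.* imports that depend on it. Below is a minimal sketch of the pattern as used in the patch; the test body is illustrative only and not part of the diff.

import pytest

# Skip every test in this module when the legacy MO package is absent;
# the call must run before any import that transitively needs it.
mo = pytest.importorskip("openvino.tools.mo", reason="Ticket - 157136")

def test_mo_importable():
    # Only collected and executed when the guard above succeeded;
    # importorskip returns the imported module itself.
    assert mo is not None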
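
The new tests/requirements_jax file, and the pytest version split in requirements_pytorch and requirements_tensorflow, select pins per platform and interpreter with PEP 508 environment markers, so one file serves Linux, Windows, and both macOS architectures. The sketch below shows how pip evaluates such a marker; it uses the packaging library for illustration only (an assumption for demonstration, not something the patch itself runs).

from packaging.markers import Marker

# One of the jax markers from tests/requirements_jax, verbatim.
marker = Marker(
    '(platform_system != "Darwin" or platform_machine != "x86_64") '
    'and python_version > "3.9"'
)

# evaluate() consults the running interpreter by default ...
print(marker.evaluate())

# ... or an explicit environment can be supplied to test other targets.
# macOS x86_64 on Python 3.9 fails this marker and falls through to
# the jax==0.4.30 pin.
print(marker.evaluate({
    "platform_system": "Darwin",
    "platform_machine": "x86_64",
    "python_version": "3.9",
}))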
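
The openvino/__init__.py rewrites above converge on one pattern: the public API is re-exported through openvino.runtime instead of being pulled name-by-name from the openvino._pyopenvino bindings, and in the mo and openvino_dev shims the whole import block is additionally wrapped so that a missing or broken runtime degrades to a warning rather than an ImportError at interpreter start. A condensed sketch of that fallback, with the import list trimmed to a few representative names:

try:
    # Re-export the public API via openvino.runtime rather than
    # importing the pybind bindings (_pyopenvino) directly.
    from openvino.runtime import Core, Model, Tensor, get_version
    __version__ = get_version()
except ImportError:
    # Keep the shim importable even when the runtime wheel is absent,
    # mirroring the fallback the patch adds to the tool packages.
    import warnings
    warnings.warn("openvino package has problems with imports!",
                  ImportWarning, stacklevel=2)

Presumably this lets the legacy tool entry points load far enough to emit a meaningful warning instead of crashing on import when the binary runtime is not installed.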