From b319014698cfed0112e4685dceb077dbcf5ed5ad Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 8 Jan 2025 13:14:48 +0800 Subject: [PATCH] [CPU]PageAttn with 4bit-quantization (#27992) ### Details: - *Add new hint to set group_size for key/value cache* - *Add grouped 4bit sym/asym quantization support for PageAttentionNode* - *Add grouped quantization for U8 quantization for PageAttentionNode* ### Tickets: - *CVS-151586* --------- Signed-off-by: yi3.zhang@intel.com Signed-off-by: Zhang Yi3 Signed-off-by: Zhang Yi --- .../openvino/runtime/properties/__init__.py | 4 + .../pyopenvino/core/properties/properties.cpp | 4 + .../tests/test_runtime/test_properties.py | 12 + .../include/openvino/runtime/properties.hpp | 24 + src/plugins/intel_cpu/src/compiled_model.cpp | 12 + src/plugins/intel_cpu/src/config.cpp | 86 +- src/plugins/intel_cpu/src/config.h | 10 + .../nodes/kernels/scaled_attn/attn_quant.cpp | 221 +++- .../nodes/kernels/scaled_attn/attn_quant.hpp | 4 +- .../kernels/scaled_attn/attn_quant_kernel.hpp | 90 +- .../nodes/kernels/scaled_attn/executor_pa.cpp | 1115 ++++++++++++----- .../nodes/kernels/scaled_attn/executor_pa.hpp | 6 +- .../intel_cpu/src/nodes/paged_attn.cpp | 31 +- .../intel_cpu/src/nodes/scaled_attn.cpp | 44 +- src/plugins/intel_cpu/src/nodes/scaled_attn.h | 7 +- src/plugins/intel_cpu/src/plugin.cpp | 12 + .../ov_executable_network/properties.cpp | 138 +- .../custom/behavior/ov_plugin/properties.cpp | 4 + 18 files changed, 1413 insertions(+), 411 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/properties/__init__.py b/src/bindings/python/src/openvino/runtime/properties/__init__.py index 511c019be8d969..76cfefcf05d8f8 100644 --- a/src/bindings/python/src/openvino/runtime/properties/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/__init__.py @@ -28,6 +28,10 @@ from openvino._pyopenvino.properties import loaded_from_cache from openvino._pyopenvino.properties import cache_encryption_callbacks from openvino._pyopenvino.properties import weights_path +from openvino._pyopenvino.properties import key_cache_precision +from openvino._pyopenvino.properties import value_cache_precision +from openvino._pyopenvino.properties import key_cache_group_size +from openvino._pyopenvino.properties import value_cache_group_size # Submodules from openvino.runtime.properties import hint diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index d0e3ddb21644e7..2e3a2c8d1c9be8 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -34,6 +34,10 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_properties, ov::force_tbb_terminate, "force_tbb_terminate"); wrap_property_RW(m_properties, ov::enable_mmap, "enable_mmap"); wrap_property_RW(m_properties, ov::weights_path, "weights_path"); + wrap_property_RW(m_properties, ov::key_cache_precision, "key_cache_precision"); + wrap_property_RW(m_properties, ov::value_cache_precision, "value_cache_precision"); + wrap_property_RW(m_properties, ov::key_cache_group_size, "key_cache_group_size"); + wrap_property_RW(m_properties, ov::value_cache_group_size, "value_cache_group_size"); wrap_property_RO(m_properties, ov::supported_properties, "supported_properties"); wrap_property_RO(m_properties, ov::available_devices, "available_devices"); diff --git a/src/bindings/python/tests/test_runtime/test_properties.py 
b/src/bindings/python/tests/test_runtime/test_properties.py index 61fb7442987418..15e2d86ead4653 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -257,6 +257,18 @@ def test_properties_ro(ov_property_ro, expected_value): "WEIGHTS_PATH", (("./model.bin", "./model.bin"),), ), + ( + props.key_cache_group_size, + "KEY_CACHE_GROUP_SIZE", + ((64, 64),), + ), + ( + props.value_cache_group_size, + "VALUE_CACHE_GROUP_SIZE", + ((64, 64),), + ), + (props.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)), + (props.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)), (hints.inference_precision, "INFERENCE_PRECISION_HINT", ((Type.f32, Type.f32),)), ( hints.model_priority, diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 28538f0f60e22e..5dd3184599260e 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -1301,4 +1301,28 @@ static constexpr Property, PropertyMutability::RO> exec * @note This property is used for weightless caching. Only used when ov::CacheMode Property is set to "OPTIMIZE_SIZE". */ static constexpr Property weights_path{"WEIGHTS_PATH"}; + +/** + * @brief The precision of key cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property key_cache_precision{"KEY_CACHE_PRECISION"}; + +/** + * @brief The precision of value cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property value_cache_precision{"VALUE_CACHE_PRECISION"}; + +/** + * @brief The group_size of key cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property key_cache_group_size{"KEY_CACHE_GROUP_SIZE"}; + +/** + * @brief The group_size of value cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"}; } // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index 14a9b6e41c516f..2fb0cba78d19ce 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -256,6 +256,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const { RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), + RO_property(ov::key_cache_precision.name()), + RO_property(ov::value_cache_precision.name()), + RO_property(ov::key_cache_group_size.name()), + RO_property(ov::value_cache_group_size.name()), }; return ro_properties; @@ -313,6 +317,14 @@ ov::Any CompiledModel::get_property(const std::string& name) const { return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision); + } else if (name == ov::key_cache_precision) { + return decltype(ov::key_cache_precision)::value_type(config.keyCachePrecision); + } else if (name == ov::value_cache_precision) { + return decltype(ov::value_cache_precision)::value_type(config.valueCachePrecision); + } else if (name == ov::key_cache_group_size) { + return decltype(ov::key_cache_group_size)::value_type(config.keyCacheGroupSize); + } else if (name == 
ov::value_cache_group_size) { + return decltype(ov::value_cache_group_size)::value_type(config.valueCacheGroupSize); } OPENVINO_THROW("Unsupported property: ", name); } diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 9937c3fe1e82fa..2d2df49d1876ad 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -309,6 +309,60 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { ov::hint::kv_cache_precision.name(), ". Supported values: u8, bf16, f16, f32"); } + } else if (key == ov::key_cache_precision.name()) { + try { + keyCachePrecisionSetExplicitly = true; + auto const prec = val.as(); + if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) { + keyCachePrecision = prec; + } else { + OPENVINO_THROW("keyCachePrecision doesn't support value ", prec); + } + } catch (ov::Exception&) { + OPENVINO_THROW("Wrong value ", + val.as(), + " for property key ", + ov::key_cache_precision.name(), + ". Supported values: u8, bf16, f16, f32"); + } + } else if (key == ov::value_cache_precision.name()) { + try { + valueCachePrecisionSetExplicitly = true; + auto const prec = val.as(); + if (one_of(prec, + ov::element::f32, + ov::element::f16, + ov::element::bf16, + ov::element::u8, + ov::element::u4)) { + valueCachePrecision = prec; + } else { + OPENVINO_THROW("valueCachePrecision doesn't support value ", prec); + } + } catch (ov::Exception&) { + OPENVINO_THROW("Wrong value ", + val.as(), + " for property key ", + ov::value_cache_precision.name(), + ". Supported values: u4, u8, bf16, f16, f32"); + } + } else if (key == ov::key_cache_group_size.name() || key == ov::value_cache_group_size.name()) { + try { + auto const groupSize = val.as(); + if (key == ov::key_cache_group_size.name()) { + keyCacheGroupSizeSetExplicitly = true; + keyCacheGroupSize = groupSize; + } else { + valueCacheGroupSizeSetExplicitly = true; + valueCacheGroupSize = groupSize; + } + } catch (ov::Exception&) { + OPENVINO_THROW("Wrong value ", + val.as(), + " for property key ", + key, + ". 
Expected only unsinged integer numbers"); + } } else if (key == ov::cache_encryption_callbacks.name()) { try { const auto& encryption_callbacks = val.as(); @@ -344,6 +398,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { aclFastMath = true; } #endif + // key/value cache precision has higher priority, if not defined use kvCachePrecision + if (!keyCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) { + keyCachePrecision = kvCachePrecision; + } + if (!valueCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) { + valueCachePrecision = kvCachePrecision; + } // disable dynamic quantization and kv quantization for best accuracy if (executionMode == ov::hint::ExecutionMode::ACCURACY) { if (!fcDynamicQuantizationGroupSizeSetExplicitly) { @@ -352,6 +413,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { if (!kvCachePrecisionSetExplicitly) { kvCachePrecision = ov::element::f32; } + if (!keyCachePrecisionSetExplicitly) { + keyCachePrecision = ov::element::f32; + } + if (!valueCachePrecisionSetExplicitly) { + valueCachePrecision = ov::element::f32; + } } if (!prop.empty()) @@ -398,7 +465,7 @@ void Config::applyRtInfo(const std::shared_ptr& model) { // if user sets explicitly, it will be higher priority than rt_info if (!kvCachePrecisionSetExplicitly && model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) { - this->kvCachePrecision = + this->kvCachePrecision = this->keyCachePrecision = this->valueCachePrecision = model->get_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()}); } if (!fcDynamicQuantizationGroupSizeSetExplicitly && @@ -406,6 +473,23 @@ void Config::applyRtInfo(const std::shared_ptr& model) { this->fcDynamicQuantizationGroupSize = model->get_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()}); } + if (!keyCachePrecisionSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_precision.name()})) { + this->keyCachePrecision = + model->get_rt_info({"runtime_options", ov::key_cache_precision.name()}); + } + if (!valueCachePrecisionSetExplicitly && + model->has_rt_info({"runtime_options", ov::value_cache_precision.name()})) { + this->valueCachePrecision = + model->get_rt_info({"runtime_options", ov::value_cache_precision.name()}); + } + if (!keyCacheGroupSizeSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_group_size.name()})) { + this->keyCacheGroupSize = model->get_rt_info({"runtime_options", ov::key_cache_group_size.name()}); + } + if (!valueCacheGroupSizeSetExplicitly && + model->has_rt_info({"runtime_options", ov::value_cache_group_size.name()})) { + this->valueCacheGroupSize = + model->get_rt_info({"runtime_options", ov::value_cache_group_size.name()}); + } } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 5a347b1fa30c94..8dad1853855c56 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -48,17 +48,27 @@ struct Config { uint64_t fcDynamicQuantizationGroupSize = 32; bool fcDynamicQuantizationGroupSizeSetExplicitly = false; bool kvCachePrecisionSetExplicitly = false; + bool keyCachePrecisionSetExplicitly = false; + bool valueCachePrecisionSetExplicitly = false; + bool keyCacheGroupSizeSetExplicitly = false; + bool valueCacheGroupSizeSetExplicitly = false; #if defined(OV_CPU_WITH_ACL) bool aclFastMath = false; #endif #if defined(OPENVINO_ARCH_X86_64) ov::element::Type kvCachePrecision = 
ov::element::u8; + ov::element::Type keyCachePrecision = ov::element::u8; + ov::element::Type valueCachePrecision = ov::element::u8; size_t rtCacheCapacity = 5000ul; #else ov::element::Type kvCachePrecision = ov::element::f16; + ov::element::Type keyCachePrecision = ov::element::f16; + ov::element::Type valueCachePrecision = ov::element::f16; // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives size_t rtCacheCapacity = 0ul; #endif + size_t keyCacheGroupSize = 0ul; + size_t valueCacheGroupSize = 0ul; ov::threading::IStreamsExecutor::Config streamExecutorConfig; int streams = 1; bool streamsChanged = false; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index 095180d659142e..26282a70fcb512 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -27,10 +27,10 @@ namespace XARCH { using namespace ov; template -static void quant_u8(const T* src, uint8_t* dst, size_t n, float& scale, float& zp) { +static void find_minmax(const T* src, size_t n, float& min, float& max) { + max = -FLT_MAX; + min = FLT_MAX; size_t i = 0; - float max = -FLT_MAX; - float min = FLT_MAX; #if defined(HAVE_AVX512F) auto v0_max = _mm512_set1_ps(-FLT_MAX); auto v0_min = _mm512_set1_ps(FLT_MAX); @@ -131,12 +131,18 @@ static void quant_u8(const T* src, uint8_t* dst, size_t n, float& scale, float& max = std::max(max, tmp); min = std::min(min, tmp); } +} + +template +static void quant_u8(const T* src, uint8_t* dst, size_t n, float& scale, float& zp) { + size_t i = 0; + float max = -FLT_MAX; + float min = FLT_MAX; + find_minmax(src, n, min, max); scale = (max - min) / 255; if (scale == 0) scale = 0.0001f; zp = -min / scale; - - i = 0; #if defined(HAVE_AVX512F) auto v_scale = _mm512_set1_ps(1 / scale); auto v_zp = _mm512_set1_ps(zp); @@ -170,6 +176,116 @@ static void quant_u8(const T* src, uint8_t* dst, size_t n, float& scale, float& } } +template +static void quant_u4(const T* src, void* dst, size_t n, float& scale, float& zp) { + size_t i = 0; + float max = -FLT_MAX; + float min = FLT_MAX; + find_minmax(src, n, min, max); + auto insert_half_byte = [](uint8_t dst, uint8_t val, bool high_half) -> uint8_t { + uint8_t shift = high_half ? 
0 : 4; + return dst | (uint8_t)(val << shift); + }; + auto dst_ptr = reinterpret_cast(dst); + scale = (max - min) / ((1 << 4) - 1); + if (scale == 0) + scale = 0.0001f; + zp = -min / scale; +#if defined(HAVE_AVX512F) + auto v_scale = _mm512_set1_ps(1 / scale); + auto v_zp = _mm512_set1_ps(zp); + auto v_zero = _mm512_setzero_epi32(); + auto v_upper = _mm512_set1_epi32(15); + for (; i + 2 * vec_len_f32_avx512 <= n; i += 2 * vec_len_f32_avx512) { + auto v0 = mm512_uni_loadu_ps(src + i); + auto v1 = mm512_uni_loadu_ps(src + i + vec_len_f32_avx512); + v0 = _mm512_fmadd_ps(v0, v_scale, v_zp); + v1 = _mm512_fmadd_ps(v1, v_scale, v_zp); + auto v0_i32 = _mm512_cvt_roundps_epi32(v0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + auto v1_i32 = _mm512_cvt_roundps_epi32(v1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + v0_i32 = _mm512_max_epi32(v0_i32, v_zero); + v1_i32 = _mm512_max_epi32(v1_i32, v_zero); + v0_i32 = _mm512_min_epi32(v0_i32, v_upper); + v1_i32 = _mm512_min_epi32(v1_i32, v_upper); + __m512i idx1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + auto first_half = _mm512_permutex2var_epi32(v0_i32, idx1, v1_i32); + auto second_half = _mm512_permutex2var_epi32(v0_i32, idx2, v1_i32); + first_half = _mm512_slli_epi32(first_half, 4); + auto mask = _mm512_set1_epi32(0x0F); + second_half = _mm512_and_epi32(second_half, mask); + auto combined = _mm512_or_epi32(first_half, second_half); + _mm512_mask_cvtepi32_storeu_epi8(dst_ptr + i / 2, 0xffff, combined); + } +#endif +#if defined(HAVE_AVX2) + auto v256_zero = _mm256_set1_epi32(0); + auto v256_upper = _mm256_set1_epi32(15); + auto v256_scale = _mm256_set1_ps(1 / scale); + auto v256_zp = _mm256_set1_ps(zp); + for (; i + vec_len_f32_avx2 * 2 <= n; i += vec_len_f32_avx2 * 2) { + auto v0 = mm256_uni_loadu_ps(src + i); + auto v1 = mm256_uni_loadu_ps(src + i + vec_len_f32_avx2); + v0 = _mm256_fmadd_ps(v0, v256_scale, v256_zp); + v1 = _mm256_fmadd_ps(v1, v256_scale, v256_zp); + v0 = _mm256_round_ps(v0, _MM_ROUND_NEAREST); + v1 = _mm256_round_ps(v1, _MM_ROUND_NEAREST); + + auto v0_i32 = _mm256_cvtps_epi32(v0); + auto v1_i32 = _mm256_cvtps_epi32(v1); + v0_i32 = _mm256_max_epi32(v0_i32, v256_zero); + v1_i32 = _mm256_max_epi32(v1_i32, v256_zero); + v0_i32 = _mm256_min_epi32(v0_i32, v256_upper); + v1_i32 = _mm256_min_epi32(v1_i32, v256_upper); + auto idx1 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + v0_i32 = _mm256_permutevar8x32_epi32(v0_i32, idx1); + v1_i32 = _mm256_permutevar8x32_epi32(v1_i32, idx1); + // 0,1,2,3,4,5,6,7 | 8,9,10,11,12,13,14,15 + // _mm256_permutevar8x32_epi32 + // 0,2,4,6,1,3,5,7 | 8,10,12,14,9,11,13,15 + // _mm256_permute2x128_si256 + // 0,2,4,6,8,10,12,14 | 1,3,5,7,9,11,13,15 + // shift + mask + or + // [0,1],[2,3], ..., [12,13], [14,15] + auto first_half = _mm256_permute2x128_si256(v0_i32, v1_i32, 0x20); + auto second_half = _mm256_permute2x128_si256(v0_i32, v1_i32, 0x31); + first_half = _mm256_slli_epi32(first_half, 4); + auto mask = _mm256_set1_epi32(0x0F); + second_half = _mm256_and_si256(second_half, mask); + auto combined = _mm256_or_si256(first_half, second_half); + + auto high4 = _mm256_extractf128_si256(combined, 1); + auto low4 = _mm256_castsi256_si128(combined); + // ignore sign bit for u4 case + auto packed = _mm_packus_epi32(low4, high4); + packed = _mm_packus_epi16(packed, packed); + _mm_storel_epi64(reinterpret_cast<__m128i*>(dst_ptr + i / 2), packed); + } +#endif + for (; i < n; i++) { + 
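+        // Scalar tail of quant_u4: quantize one value at a time and pack two 4-bit results
+        // per byte; with insert_half_byte(high_half = i % 2), even indices land in the upper
+        // nibble and odd indices in the lower nibble of dst_ptr[i / 2].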
float tmp = src[i]; +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + uint8_t src_val = MIN(15, (uint8_t)(std::round(tmp / scale + zp))); + uint8_t dst_val = i % 2 == 0 ? 0 : dst_ptr[i / 2]; + dst_val = insert_half_byte(dst_val, src_val, (uint8_t)(i % 2)); + dst_ptr[i / 2] = dst_val; + } +} + +template ::type = true> +static void quantize(const T* src, uint8_t* dst, size_t n, float* scale_zp) { + quant_u8(src, dst, n, *scale_zp, *(scale_zp + 1)); +} + +template ::type = true> +static void quantize(const T* src, void* dst, size_t n, float* scale_zp) { + quant_u4(src, dst, n, *scale_zp, *(scale_zp + 1)); +} + template static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, const ov::intel_cpu::PlainTensor& v_src, @@ -187,36 +303,55 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, }); } -template +template static void paged_attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, const ov::intel_cpu::PlainTensor& v_src, const ov::intel_cpu::PlainTensor& k_dst, const ov::intel_cpu::PlainTensor& v_dst, - const ov::intel_cpu::PlainTensor& slot_mapping) { + const ov::intel_cpu::PlainTensor& slot_mapping, + const size_t key_group_size, + const size_t value_group_size) { size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3], SV = v_src.m_dims[3]; size_t block_size = k_dst.m_dims[2]; + size_t sub_byte_multiplier = 8 / v_dst.get_precision().bitwidth(); parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; if (slot < 0) return; auto block_number = slot / block_size; auto block_offset = slot % block_size; - - auto p_k = reinterpret_cast(k_dst.ptr(block_number, h, block_offset)); - auto p_v = reinterpret_cast(v_dst.ptr(block_number, h, block_offset)); // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| - quant_u8(k_src.ptr(b, h, m), - k_dst.ptr(block_number, h, block_offset) + sizeof(float) + sizeof(float), - S, - p_k[0], - p_k[1]); - quant_u8(v_src.ptr(b, h, m), - v_dst.ptr(block_number, h, block_offset) + sizeof(float) + sizeof(float), - SV, - p_v[0], - p_v[1]); + for (size_t src_offset = 0, dst_offset = 0; src_offset < S; + src_offset += key_group_size, dst_offset += key_group_size + sizeof(float) + sizeof(float)) { + auto p_k = reinterpret_cast( + k_dst.ptr::value_type>(block_number, + h, + block_offset, + dst_offset)); + quantize( + k_src.ptr(b, h, m, src_offset), + k_dst.ptr::value_type>(block_number, + h, + block_offset, + dst_offset) + + sizeof(float) + sizeof(float), + key_group_size, + p_k); + } + + for (size_t src_offset = 0, dst_offset = 0; src_offset < SV; src_offset += value_group_size, + dst_offset += value_group_size / sub_byte_multiplier + sizeof(float) + sizeof(float)) { + uint8_t* v_base = reinterpret_cast( + v_dst.m_ptr.get() + + (block_number * v_dst.m_strides[0] + h * v_dst.m_strides[1] + block_offset * v_dst.m_strides[2]) / + sub_byte_multiplier + + dst_offset); + auto p_v = reinterpret_cast(v_base); + uint8_t* v_ptr = v_base + sizeof(float) * 2; + quantize(v_src.ptr(b, h, m, src_offset), v_ptr, value_group_size, p_v); + } }); } @@ -245,20 +380,48 @@ void paged_attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, const ov::intel_cpu::PlainTensor& v_src, const ov::intel_cpu::PlainTensor& k_dst, const ov::intel_cpu::PlainTensor& v_dst, - const ov::intel_cpu::PlainTensor& slot_mapping) { - if (k_src.get_precision() == ov::element::f32 && k_dst.get_precision() == 
ov::element::u8) { - paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); - } else if (k_src.get_precision() == ov::element::bf16 && k_dst.get_precision() == ov::element::u8) { - paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); - } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { - paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); - } else { + const ov::intel_cpu::PlainTensor& slot_mapping, + const size_t key_group_size, + const size_t value_group_size) { + using function_type = void (*)(const ov::intel_cpu::PlainTensor&, + const ov::intel_cpu::PlainTensor&, + const ov::intel_cpu::PlainTensor&, + const ov::intel_cpu::PlainTensor&, + const ov::intel_cpu::PlainTensor&, + const size_t, + const size_t); + static constexpr function_type funcs_fp32[] = { + paged_attn_quant_mt, + paged_attn_quant_mt, + }; + static constexpr function_type funcs_bf16[] = { + paged_attn_quant_mt, + paged_attn_quant_mt, + }; + static constexpr function_type funcs_f16[] = { + paged_attn_quant_mt, + paged_attn_quant_mt, + }; + if (k_dst.get_precision() != ov::element::u8) { OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in paged_attn_quantkv"); } + std::map dispatch_table = { + {ov::element::u8, 0}, + {ov::element::u4, 1}, + {ov::element::i4, 2}, + }; + size_t dispatch = dispatch_table[v_dst.get_precision()]; + if (k_src.get_precision() == ov::element::f32) { + funcs_fp32[dispatch](k_src, v_src, k_dst, v_dst, slot_mapping, key_group_size, value_group_size); + } else if (k_src.get_precision() == ov::element::bf16) { + funcs_bf16[dispatch](k_src, v_src, k_dst, v_dst, slot_mapping, key_group_size, value_group_size); + } else if (k_src.get_precision() == ov::element::f16) { + funcs_f16[dispatch](k_src, v_src, k_dst, v_dst, slot_mapping, key_group_size, value_group_size); + } } void attn_quant_u8(const float* src, uint8_t* dst, size_t n, float& scale, float& zp) { @@ -266,7 +429,7 @@ void attn_quant_u8(const float* src, uint8_t* dst, size_t n, float& scale, float } void attn_dequant_u8(const uint8_t* src, float* dst, size_t n, float scale, float zp) { - attn_dequant_u8_kernel(src, dst, n, scale, zp); + attn_dequant_kernel(src, dst, n, scale, zp); } } // namespace XARCH diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp index 2f39f74f5b3460..364e5775861ed2 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp @@ -27,7 +27,9 @@ void paged_attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, const ov::intel_cpu::PlainTensor& v_src, const ov::intel_cpu::PlainTensor& k_dst, const ov::intel_cpu::PlainTensor& v_dst, - const ov::intel_cpu::PlainTensor& slot_mapping); + const ov::intel_cpu::PlainTensor& slot_mapping, + const size_t key_group_size, + const size_t value_group_size); void attn_quant_u8(const float* src, uint8_t* dst, size_t n, float& scale, float& zp); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp index 759d0005103871..761a136eda2997 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp @@ -17,8 +17,10 @@ namespace Extensions { namespace Cpu { namespace XARCH 
{ -template -void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, float zp) { +template ::type = true> +void attn_dequant_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, float zp) { size_t i = 0; // loadu_si128/epi64 does not support const qualifier uint8_t* src_nc = const_cast(src); @@ -52,6 +54,90 @@ void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale } } +template ::type = true> +void attn_dequant_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, float zp) { + // 2 4bit data form a byte + /* 0,1|2,3|4,5|6,7 + / \ + 0,2,4,6|1,3,5,7 + | + permute + | + 0,1,2,3,4,5,6,7 + */ + size_t i = 0; + uint8_t* src_nc = const_cast(src); +#if defined(HAVE_AVX512F) + auto v_scale = _mm512_set1_ps(scale); + auto v_zp_scale = _mm512_set1_ps(zp * scale); + for (; i + vec_len_f32_avx512 * 2 <= n; i += vec_len_f32_avx512 * 2) { + auto data = _mm_loadu_si128(reinterpret_cast<__m128i*>(src_nc + i / 2)); + auto v_i32 = _mm512_cvtepu8_epi32(data); + + auto v_512_low_half = _mm512_srli_epi32(v_i32, 4); + auto v_f32_low_half = _mm512_cvtepi32_ps(v_512_low_half); + + auto mask = _mm512_set1_epi32(0x0F); + auto v_512_high_half = _mm512_and_si512(v_i32, mask); + auto v_f32_high_half = _mm512_cvtepi32_ps(v_512_high_half); + // q * scale- zp * scale + v_f32_low_half = _mm512_fmsub_ps(v_f32_low_half, v_scale, v_zp_scale); + v_f32_high_half = _mm512_fmsub_ps(v_f32_high_half, v_scale, v_zp_scale); + __m512i idx1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + __m512 first_half = _mm512_permutex2var_ps(v_f32_low_half, idx1, v_f32_high_half); + __m512 second_half = _mm512_permutex2var_ps(v_f32_low_half, idx2, v_f32_high_half); + mm512_uni_storeu_ps(dst + i, first_half); + mm512_uni_storeu_ps(dst + i + vec_len_f32_avx512, second_half); + } +#elif defined(HAVE_AVX2) + auto v256_zp = _mm256_set1_ps(zp); + auto v256_scale = _mm256_set1_ps(scale); + for (; i + vec_len_f32_avx2 * 2 <= n; i += vec_len_f32_avx2 * 2) { + auto data = _mm_loadl_epi64(reinterpret_cast<__m128i*>(src_nc + i / 2)); + + auto v_i32 = _mm256_cvtepu8_epi32(data); + auto v_256_low_half = _mm256_srli_epi32(v_i32, 4); + auto v_f32_low_half = _mm256_cvtepi32_ps(v_256_low_half); + + auto mask = _mm256_set1_epi32(0x0F); + auto v_256_high_half = _mm256_and_si256(v_i32, mask); + auto v_f32_high_half = _mm256_cvtepi32_ps(v_256_high_half); + // q - zp + v_f32_low_half = _mm256_sub_ps(v_f32_low_half, v256_zp); + v_f32_high_half = _mm256_sub_ps(v_f32_high_half, v256_zp); + + v_f32_low_half = _mm256_mul_ps(v_f32_low_half, v256_scale); + v_f32_high_half = _mm256_mul_ps(v_f32_high_half, v256_scale); + + // 0,2,4,6,8,10,12,14 | 1,3,5,7,9,11,13,15 + // _mm256_permute2f128_ps + // 0,2,4,6,1,3,5,7 | 8,10,12,14,9,11,13,15 + // _mm256_permutevar8x32_ps + // 0,1,2,3,4,5,6,7 | 8,9,10,11,12,13,14,15 + __m256 first_half = _mm256_permute2f128_ps(v_f32_low_half, v_f32_high_half, 0x20); + auto idx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + first_half = _mm256_permutevar8x32_ps(first_half, idx1); + __m256 second_half = _mm256_permute2f128_ps(v_f32_low_half, v_f32_high_half, 0x31); + second_half = _mm256_permutevar8x32_ps(second_half, idx1); + + mm256_uni_storeu_ps(dst + i, first_half); + mm256_uni_storeu_ps(dst + i + vec_len_f32_avx2, second_half); + } +#endif + auto extract_half_byte = [&](uint8_t val, bool high_half) -> uint8_t { + uint8_t shift = high_half ? 
0 : 4; + return (uint8_t)((val >> shift) & 0x000F); + }; + for (; i < n; ++i) { + float tmp = extract_half_byte(src_nc[i / 2], (uint8_t)(i % 2)); + tmp = (tmp - zp) * scale; + dst[i] = tmp; + } +} + } // namespace XARCH } // namespace Cpu } // namespace Extensions diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index a74021d8ac0d05..955e7687ef97b3 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -72,8 +72,22 @@ void cvt_copy(TA* dst, TB* src, size_t n) { } } -template -static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size_t block_size) { +size_t inline get_sub_byte_multiplier(ov::element::Type type) { + return one_of(type, ov::element::i4, ov::element::u4) ? 8 / type.bitwidth() : 1; +} + +template ::value || std::is_same::value || + std::is_same::value) && + (SRC_PREC != ov::element::u8 || SRC_PREC != ov::element::u4), + bool>::type = true> +static void attn_acc_value_block(float* out, + float* weight, + T* v, + const size_t S, + const size_t block_size, + const size_t group_size) { # if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { @@ -200,117 +214,262 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size v += S; } } - -static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S, size_t block_size) { +template ::type = true> +static void attn_acc_value_block(float* out, + float* weight, + uint8_t* v, + const size_t S, + const size_t block_size, + const size_t group_size) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + size_t src_offset = 0; + size_t dst_offset = 0; + const size_t params_offset = sizeof(float) * 2; + const size_t src_stride = S / group_size * (group_size + params_offset); + # if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { - auto v_f0 = reinterpret_cast(v); - auto v_f1 = reinterpret_cast(v + S + 8); - auto v_f2 = reinterpret_cast(v + 2 * (S + 8)); - auto v_f3 = reinterpret_cast(v + 3 * (S + 8)); - auto attn_w_vec0 = _mm512_set1_ps(weight[0] * v_f0[0]); - auto attn_w_vec1 = _mm512_set1_ps(weight[1] * v_f1[0]); - auto attn_w_vec2 = _mm512_set1_ps(weight[2] * v_f2[0]); - auto attn_w_vec3 = _mm512_set1_ps(weight[3] * v_f3[0]); - auto zp0 = _mm512_set1_ps(v_f0[1]); - auto zp1 = _mm512_set1_ps(v_f1[1]); - auto zp2 = _mm512_set1_ps(v_f2[1]); - auto zp3 = _mm512_set1_ps(v_f3[1]); - size_t i = 0; - v += 8; - for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { - auto v_out = mm512_uni_loadu_ps(out + i); - auto v0 = _mm512_sub_ps( - _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), - zp0); - auto v1 = _mm512_sub_ps( - _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + S + 8)))), - zp1); - auto v2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( - _mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 2 * (S + 8))))), - zp2); - auto v3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( - _mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 3 * (S + 8))))), - zp3); - v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); - v_out = _mm512_fmadd_ps(attn_w_vec1, v1, 
v_out); - v_out = _mm512_fmadd_ps(attn_w_vec2, v2, v_out); - v_out = _mm512_fmadd_ps(attn_w_vec3, v3, v_out); - - _mm512_storeu_ps(out + i, v_out); - } - for (; i < S; i++) { - out[i] += weight[0] * (v[i] - v_f0[1]) * v_f0[0]; - out[i] += weight[1] * (v[i + S + 8] - v_f1[1]) * v_f1[0]; - out[i] += weight[2] * (v[i + 2 * (S + 8)] - v_f2[1]) * v_f2[0]; - out[i] += weight[3] * (v[i + 3 * (S + 8)] - v_f3[1]) * v_f3[0]; + dst_offset = 0; + src_offset = 0; + while (dst_offset < S) { + // process group by group + uint8_t* v_ptr = v + src_offset; + auto v_f0 = reinterpret_cast(v_ptr); + auto v_f1 = reinterpret_cast(v_ptr + src_stride); + auto v_f2 = reinterpret_cast(v_ptr + 2 * src_stride); + auto v_f3 = reinterpret_cast(v_ptr + 3 * src_stride); + auto attn_w_vec0 = _mm512_set1_ps(weight[0] * v_f0[0]); + auto attn_w_vec1 = _mm512_set1_ps(weight[1] * v_f1[0]); + auto attn_w_vec2 = _mm512_set1_ps(weight[2] * v_f2[0]); + auto attn_w_vec3 = _mm512_set1_ps(weight[3] * v_f3[0]); + auto zp0 = _mm512_set1_ps(v_f0[1]); + auto zp1 = _mm512_set1_ps(v_f1[1]); + auto zp2 = _mm512_set1_ps(v_f2[1]); + auto zp3 = _mm512_set1_ps(v_f3[1]); + uint8_t* v_data_ptr = v + src_offset + params_offset; + size_t i = 0; + for (; i + vec_len_f32_avx512 <= group_size; i += vec_len_f32_avx512) { + auto v_out = mm512_uni_loadu_ps(out + dst_offset + i); + auto v0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v_data_ptr + i)))), + zp0); + auto v1 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v_data_ptr + i + src_stride)))), + zp1); + auto v2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128( + reinterpret_cast<__m128i*>(v_data_ptr + i + 2 * src_stride)))), + zp2); + auto v3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128( + reinterpret_cast<__m128i*>(v_data_ptr + i + 3 * src_stride)))), + zp3); + v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); + v_out = _mm512_fmadd_ps(attn_w_vec1, v1, v_out); + v_out = _mm512_fmadd_ps(attn_w_vec2, v2, v_out); + v_out = _mm512_fmadd_ps(attn_w_vec3, v3, v_out); + _mm512_storeu_ps(out + dst_offset + i, v_out); + } + for (; i < group_size; i++) { + out[i] += weight[0] * (v_data_ptr[i] - v_f0[1]) * v_f0[0]; + out[i] += weight[1] * (v_data_ptr[i + src_stride] - v_f1[1]) * v_f1[0]; + out[i] += weight[2] * (v_data_ptr[i + 2 * src_stride] - v_f2[1]) * v_f2[0]; + out[i] += weight[3] * (v_data_ptr[i + 3 * src_stride] - v_f3[1]) * v_f3[0]; + } + dst_offset += group_size; + src_offset += group_size + params_offset; } - v += 4 * (S + 8) - 8; weight += 4; + v += 4 * src_stride; } for (; j < block_size; j++) { - auto v_f0 = reinterpret_cast(v); - auto attn_w_vec0 = _mm512_set1_ps(weight[0] * v_f0[0]); - auto zp0 = _mm512_set1_ps(v_f0[1]); - size_t i = 0; - v += 8; - for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { - auto v_out = mm512_uni_loadu_ps(out + i); - auto v0 = _mm512_sub_ps( - _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), - zp0); - v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); - - _mm512_storeu_ps(out + i, v_out); - } - for (; i < S; i++) { - out[i] += weight[0] * (v[i] - v_f0[1]) * v_f0[0]; + dst_offset = 0; + src_offset = 0; + while (dst_offset < S) { + uint8_t* v_ptr = v + src_offset; + uint8_t* v_data_ptr = v_ptr + params_offset; + auto v_f0 = reinterpret_cast(v_ptr); + auto attn_w_vec0 = _mm512_set1_ps(weight[0] * v_f0[0]); + auto zp0 = _mm512_set1_ps(v_f0[1]); + size_t i = 0; + 
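+            // Per group: out[dst_offset + i] += weight[0] * scale * (u8 - zp); the scale v_f0[0]
+            // is pre-multiplied into attn_w_vec0, so the inner loop only subtracts the zero-point.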
for (; i + vec_len_f32_avx512 <= group_size; i += vec_len_f32_avx512) { + auto v_out = mm512_uni_loadu_ps((out + dst_offset + i)); + auto v0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v_data_ptr + i)))), + zp0); + v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); + + _mm512_storeu_ps((out + dst_offset + i), v_out); + } + for (; i < group_size; i++) { + out[dst_offset + i] += weight[0] * (v_data_ptr[i] - v_f0[1]) * v_f0[0]; + } + dst_offset += group_size; + src_offset += group_size + params_offset; } - v += S; + v += src_stride; weight++; } return; # elif defined(HAVE_AVX2) size_t j = 0; for (; j < block_size; j++) { - auto v_f0 = reinterpret_cast(v); - auto attn_w_vec0 = _mm256_set1_ps(weight[0] * v_f0[0]); - auto zp0 = _mm256_set1_ps(v_f0[1]); - size_t i = 0; - v += 8; - for (; i + vec_len_f32_avx2 <= S; i += vec_len_f32_avx2) { - auto v_out = mm256_uni_loadu_ps(out + i); - auto v0 = _mm256_sub_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(v + i)))), - zp0); - v_out = _mm256_fmadd_ps(attn_w_vec0, v0, v_out); - - mm256_uni_storeu_ps(out + i, v_out); - } - for (; i < S; i++) { - out[i] += weight[0] * (v[i] - v_f0[1]) * v_f0[0]; + dst_offset = 0; + src_offset = 0; + while (dst_offset < S) { + uint8_t* v_ptr = v + src_offset; + uint8_t* v_data_ptr = v_ptr + params_offset; + auto v_f0 = reinterpret_cast(v_ptr); + auto attn_w_vec0 = _mm256_set1_ps(weight[0] * v_f0[0]); + auto zp0 = _mm256_set1_ps(v_f0[1]); + size_t i = 0; + v += 8; + for (; i + vec_len_f32_avx2 <= group_size; i += vec_len_f32_avx2) { + auto v_out = mm256_uni_loadu_ps(out + dst_offset + i); + auto v0 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(v_data_ptr + i)))), + zp0); + v_out = _mm256_fmadd_ps(attn_w_vec0, v0, v_out); + + mm256_uni_storeu_ps(out + dst_offset + i, v_out); + } + for (; i < group_size; i++) { + out[dst_offset + i] += weight[0] * (v_data_ptr[i] - v_f0[1]) * v_f0[0]; + } + dst_offset += group_size; + src_offset += group_size + params_offset; } - v += S; + v += src_stride; weight++; } return; # endif for (size_t j = 0; j < block_size; j++) { - auto v0 = reinterpret_cast(v); - v += 8; - for (size_t i = 0; i < S; i++) { - out[i] += weight[j] * (v[i] - v0[1]) * v0[0]; + dst_offset = 0; + src_offset = 0; + while (dst_offset < S) { + auto v0 = reinterpret_cast(v + src_offset); + for (size_t i = 0; i < group_size; i++) { + out[dst_offset + i] += weight[j] * (v[i + src_offset + params_offset] - v0[1]) * v0[0]; + } + dst_offset += group_size; + src_offset += group_size + params_offset; } - v += S; + v += src_stride; + } +} + +template ::type = true> +static void attn_acc_value_block(float* out, + float* weight, + void* v, + const size_t S, + const size_t block_size, + const size_t group_size) { + size_t src_offset = 0; + size_t dst_offset = 0; + const size_t params_offset = sizeof(float) * 2; + uint8_t* v_ptr = reinterpret_cast(v); + auto sub_byte_multiplier = 8 / 4; + const size_t src_stride = S / group_size * (group_size / sub_byte_multiplier + params_offset); + auto extract_half_byte = [](uint8_t val, bool high_half) -> uint8_t { + uint8_t shift = high_half ? 
0 : 4; + + return (uint8_t)((val >> shift) & 0x000F); + }; + for (size_t j = 0; j < block_size; j++) { + dst_offset = 0; + src_offset = 0; + while (dst_offset < S) { + auto v0 = reinterpret_cast(v_ptr + src_offset); + size_t i = 0; +# if defined(HAVE_AVX512F) + auto attn_w_vec0 = _mm512_set1_ps(weight[j] * v0[0]); + auto v_zp = _mm512_set1_ps(v0[1]); + for (; i + vec_len_f32_avx512 * 2 <= group_size; i += vec_len_f32_avx512 * 2) { + auto data = _mm_loadu_si128(reinterpret_cast<__m128i*>(v_ptr + i / 2 + src_offset + params_offset)); + auto v_i32 = _mm512_cvtepu8_epi32(data); + + auto v_512_low_half = _mm512_srli_epi32(v_i32, 4); + auto v_f32_low_half = _mm512_cvtepi32_ps(v_512_low_half); + + auto mask = _mm512_set1_epi32(0x0F); + auto v_512_high_half = _mm512_and_si512(v_i32, mask); + auto v_f32_high_half = _mm512_cvtepi32_ps(v_512_high_half); + + // q - zp + v_f32_low_half = _mm512_sub_ps(v_f32_low_half, v_zp); + v_f32_high_half = _mm512_sub_ps(v_f32_high_half, v_zp); + + __m512i idx1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + __m512 first_half = _mm512_permutex2var_ps(v_f32_low_half, idx1, v_f32_high_half); + __m512 second_half = _mm512_permutex2var_ps(v_f32_low_half, idx2, v_f32_high_half); + auto v_out0 = mm512_uni_loadu_ps(out + dst_offset + i); + auto v_out1 = mm512_uni_loadu_ps(out + dst_offset + i + vec_len_f32_avx512); + v_out0 = _mm512_fmadd_ps(attn_w_vec0, first_half, v_out0); + v_out1 = _mm512_fmadd_ps(attn_w_vec0, second_half, v_out1); + mm512_uni_storeu_ps(out + dst_offset + i, v_out0); + mm512_uni_storeu_ps(out + dst_offset + i + vec_len_f32_avx512, v_out1); + } +# elif defined(HAVE_AVX2) + auto v256_attn_w_vec0 = _mm256_set1_ps(weight[j] * v0[0]); + auto v256_zp = _mm256_set1_ps(v0[1]); + for (; i + vec_len_f32_avx2 * 2 <= group_size; i += vec_len_f32_avx2 * 2) { + auto data = _mm_loadl_epi64(reinterpret_cast<__m128i*>(v_ptr + i / 2 + src_offset + params_offset)); + + auto v_i32 = _mm256_cvtepu8_epi32(data); + auto v_256_low_half = _mm256_srli_epi32(v_i32, 4); + auto v_f32_low_half = _mm256_cvtepi32_ps(v_256_low_half); + + auto mask = _mm256_set1_epi32(0x0F); + auto v_256_high_half = _mm256_and_si256(v_i32, mask); + auto v_f32_high_half = _mm256_cvtepi32_ps(v_256_high_half); + // q - zp + v_f32_low_half = _mm256_sub_ps(v_f32_low_half, v256_zp); + v_f32_high_half = _mm256_sub_ps(v_f32_high_half, v256_zp); + + auto v_out0 = mm256_uni_loadu_ps(out + dst_offset + i); + auto v_out1 = mm256_uni_loadu_ps(out + dst_offset + i + vec_len_f32_avx2); + + __m256 first_half = _mm256_permute2f128_ps(v_f32_low_half, v_f32_high_half, 0x20); + auto idx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + first_half = _mm256_permutevar8x32_ps(first_half, idx1); + __m256 second_half = _mm256_permute2f128_ps(v_f32_low_half, v_f32_high_half, 0x31); + second_half = _mm256_permutevar8x32_ps(second_half, idx1); + + v_out0 = _mm256_fmadd_ps(v256_attn_w_vec0, first_half, v_out0); + v_out1 = _mm256_fmadd_ps(v256_attn_w_vec0, second_half, v_out1); + mm256_uni_storeu_ps(out + dst_offset + i, v_out0); + mm256_uni_storeu_ps(out + dst_offset + i + vec_len_f32_avx2, v_out1); + } +# endif + for (; i < group_size; i += 2) { + uint8_t data = v_ptr[i / 2 + src_offset + params_offset]; + float tmp0 = extract_half_byte(data, static_cast(i % 2)); + float tmp1 = extract_half_byte(data, static_cast((i + 1) % 2)); + out[dst_offset + i] += weight[j] * (tmp0 - v0[1]) * v0[0]; + out[dst_offset + i 
+ 1] += weight[j] * (tmp1 - v0[1]) * v0[0]; + } + dst_offset += group_size; + src_offset += group_size / sub_byte_multiplier + params_offset; + } + v_ptr += src_stride; } } template -static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_size) { +static void dot_product_block(TA* a, + TB* b, + float* c, + const size_t n, + const size_t block_size, + const size_t group_size) { # if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { @@ -422,175 +581,235 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz } template -static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t block_size) { +static void dot_product_block(TA* a, + uint8_t* b, + float* c, + const size_t n, + const size_t block_size, + const size_t group_size) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + size_t src_offset = 0; + size_t dst_offset = 0; + const size_t params_offset = sizeof(float) * 2; + const size_t src_stride = n / group_size * (group_size + params_offset); # if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { - auto vsum0 = _mm512_setzero_ps(); - auto vsum1 = _mm512_setzero_ps(); - auto vsum2 = _mm512_setzero_ps(); - auto vsum3 = _mm512_setzero_ps(); - auto b0 = reinterpret_cast(b); - auto b1 = reinterpret_cast(b + n + 8); - auto b2 = reinterpret_cast(b + (n + 8) * 2); - auto b3 = reinterpret_cast(b + (n + 8) * 3); - auto v_zp0 = _mm512_set1_ps(b0[1]); - auto v_zp1 = _mm512_set1_ps(b1[1]); - auto v_zp2 = _mm512_set1_ps(b2[1]); - auto v_zp3 = _mm512_set1_ps(b3[1]); - size_t i = 0; - b += 8; - for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { - auto va = mm512_uni_loadu_ps(a + i); - auto vb0 = _mm512_sub_ps( - _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), - v_zp0); - auto vb1 = _mm512_sub_ps( - _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + n + 8)))), - v_zp1); - auto vb2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( - _mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), - v_zp2); - auto vb3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( - _mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), - v_zp3); - - vsum0 = _mm512_fmadd_ps(va, vb0, vsum0); - vsum1 = _mm512_fmadd_ps(va, vb1, vsum1); - vsum2 = _mm512_fmadd_ps(va, vb2, vsum2); - vsum3 = _mm512_fmadd_ps(va, vb3, vsum3); + src_offset = 0; + dst_offset = 0; + float sum0 = 0.0f; + float sum1 = 0.0f; + float sum2 = 0.0f; + float sum3 = 0.0f; + while (dst_offset < n) { + auto vsum0 = _mm512_setzero_ps(); + auto vsum1 = _mm512_setzero_ps(); + auto vsum2 = _mm512_setzero_ps(); + auto vsum3 = _mm512_setzero_ps(); + auto b0 = reinterpret_cast(b + src_offset); + auto b1 = reinterpret_cast(b + src_offset + src_stride); + auto b2 = reinterpret_cast(b + src_offset + src_stride * 2); + auto b3 = reinterpret_cast(b + src_offset + src_stride * 3); + auto v_zp0 = _mm512_set1_ps(b0[1]); + auto v_zp1 = _mm512_set1_ps(b1[1]); + auto v_zp2 = _mm512_set1_ps(b2[1]); + auto v_zp3 = _mm512_set1_ps(b3[1]); + size_t i = 0; + uint8_t* b_data_ptr = b + src_offset + params_offset; + for (; i + vec_len_f32_avx512 <= group_size; i += vec_len_f32_avx512) { + auto va = mm512_uni_loadu_ps(a + dst_offset + i); + auto vb0 = 
_mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b_data_ptr + i)))), + v_zp0); + auto vb1 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b_data_ptr + i + src_stride)))), + v_zp1); + auto vb2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128( + reinterpret_cast<__m128i*>(b_data_ptr + i + 2 * src_stride)))), + v_zp2); + auto vb3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128( + reinterpret_cast<__m128i*>(b_data_ptr + i + 3 * src_stride)))), + v_zp3); + + vsum0 = _mm512_fmadd_ps(va, vb0, vsum0); + vsum1 = _mm512_fmadd_ps(va, vb1, vsum1); + vsum2 = _mm512_fmadd_ps(va, vb2, vsum2); + vsum3 = _mm512_fmadd_ps(va, vb3, vsum3); + } + float group_sum0 = _mm512_reduce_add_ps(vsum0); + float group_sum1 = _mm512_reduce_add_ps(vsum1); + float group_sum2 = _mm512_reduce_add_ps(vsum2); + float group_sum3 = _mm512_reduce_add_ps(vsum3); + for (; i < group_size; i++) { + group_sum0 += a[i + dst_offset] * (b_data_ptr[i] - b0[1]); + group_sum1 += a[i + dst_offset] * (b_data_ptr[i + src_stride] - b1[1]); + group_sum2 += a[i + dst_offset] * (b_data_ptr[i + 2 * src_stride] - b2[1]); + group_sum3 += a[i + dst_offset] * (b_data_ptr[i + 3 * src_stride] - b3[1]); + } + sum0 += group_sum0 * b0[0]; + sum1 += group_sum1 * b1[0]; + sum2 += group_sum2 * b2[0]; + sum3 += group_sum3 * b3[0]; + dst_offset += group_size; + src_offset += group_size + params_offset; } - float sum0 = _mm512_reduce_add_ps(vsum0); - float sum1 = _mm512_reduce_add_ps(vsum1); - float sum2 = _mm512_reduce_add_ps(vsum2); - float sum3 = _mm512_reduce_add_ps(vsum3); - for (; i < n; i++) { - sum0 += a[i] * (b[i] - b0[1]); - sum1 += a[i] * (b[i + n + 8] - b1[1]); - sum2 += a[i] * (b[i + 2 * (n + 8)] - b2[1]); - sum3 += a[i] * (b[i + 3 * (n + 8)] - b3[1]); - } - c[0] = sum0 * b0[0]; - c[1] = sum1 * b1[0]; - c[2] = sum2 * b2[0]; - c[3] = sum3 * b3[0]; + c[0] = sum0; + c[1] = sum1; + c[2] = sum2; + c[3] = sum3; c += 4; - b += 4 * (n + 8) - 8; + b += 4 * src_stride; } for (; j < block_size; j++) { - auto vsum = _mm512_setzero_ps(); - auto b0 = reinterpret_cast(b); - auto v_zp = _mm512_set1_ps(b0[1]); - size_t i = 0; - b += 8; - for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { - auto va = mm512_uni_loadu_ps(a + i); - auto vb = _mm512_sub_ps( - _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), - v_zp); - vsum = _mm512_fmadd_ps(va, vb, vsum); - } - float sum = _mm512_reduce_add_ps(vsum); - for (; i < n; i++) { - sum += a[i] * (b[i] - b0[1]); + src_offset = 0; + dst_offset = 0; + float sum = 0; + while (dst_offset < n) { + auto vsum = _mm512_setzero_ps(); + auto b0 = reinterpret_cast(b + src_offset); + auto v_zp = _mm512_set1_ps(b0[1]); + size_t i = 0; + uint8_t* b_data_ptr = b + src_offset + params_offset; + for (; i + vec_len_f32_avx512 <= group_size; i += vec_len_f32_avx512) { + auto va = mm512_uni_loadu_ps(a + dst_offset + i); + auto vb = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b_data_ptr + i)))), + v_zp); + vsum = _mm512_fmadd_ps(va, vb, vsum); + } + float group_sum = _mm512_reduce_add_ps(vsum); + for (; i < group_size; i++) { + group_sum += a[i + dst_offset] * (b_data_ptr[i] - b0[1]); + } + sum += group_sum * b0[0]; + dst_offset += group_size; + src_offset += group_size + params_offset; } - b += n; - *c++ = sum * b0[0]; + b += src_stride; + *c++ = sum; } return; # elif defined(HAVE_AVX2) 
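+    // The AVX2 path mirrors the AVX512 one above: for every group of group_size u8 values,
+    // the partial dot product a . (q - zp) is reduced, multiplied by the group scale b0[0],
+    // and accumulated into the per-row sum before advancing by group_size + params_offset.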
size_t j = 0; for (; j + 4 <= block_size; j += 4) { - auto vsum0 = _mm256_setzero_ps(); - auto vsum1 = _mm256_setzero_ps(); - auto vsum2 = _mm256_setzero_ps(); - auto vsum3 = _mm256_setzero_ps(); - auto b0 = reinterpret_cast(b); - auto b1 = reinterpret_cast(b + n + 8); - auto b2 = reinterpret_cast(b + (n + 8) * 2); - auto b3 = reinterpret_cast(b + (n + 8) * 3); - auto v_zp0 = _mm256_set1_ps(b0[1]); - auto v_zp1 = _mm256_set1_ps(b1[1]); - auto v_zp2 = _mm256_set1_ps(b2[1]); - auto v_zp3 = _mm256_set1_ps(b3[1]); - size_t i = 0; - b += 8; - for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { - auto va = mm256_uni_loadu_ps(a + i); - auto vb0 = _mm256_sub_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), - v_zp0); - auto vb1 = _mm256_sub_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + n + 8)))), - v_zp1); - auto vb2 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), - v_zp2); - auto vb3 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), - v_zp3); - - vsum0 = _mm256_fmadd_ps(va, vb0, vsum0); - vsum1 = _mm256_fmadd_ps(va, vb1, vsum1); - vsum2 = _mm256_fmadd_ps(va, vb2, vsum2); - vsum3 = _mm256_fmadd_ps(va, vb3, vsum3); + src_offset = 0; + dst_offset = 0; + float sum0 = 0.0f; + float sum1 = 0.0f; + float sum2 = 0.0f; + float sum3 = 0.0f; + while (dst_offset < n) { + auto vsum0 = _mm256_setzero_ps(); + auto vsum1 = _mm256_setzero_ps(); + auto vsum2 = _mm256_setzero_ps(); + auto vsum3 = _mm256_setzero_ps(); + auto b0 = reinterpret_cast(b + src_offset); + auto b1 = reinterpret_cast(b + src_offset + src_stride); + auto b2 = reinterpret_cast(b + src_offset + src_stride * 2); + auto b3 = reinterpret_cast(b + src_offset + src_stride * 3); + auto v_zp0 = _mm256_set1_ps(b0[1]); + auto v_zp1 = _mm256_set1_ps(b1[1]); + auto v_zp2 = _mm256_set1_ps(b2[1]); + auto v_zp3 = _mm256_set1_ps(b3[1]); + size_t i = 0; + uint8_t* b_data_ptr = b + src_offset + params_offset; + for (; i + vec_len_f32_avx2 <= group_size; i += vec_len_f32_avx2) { + auto va = mm256_uni_loadu_ps(a + dst_offset + i); + auto vb0 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(b_data_ptr + i)))), + v_zp0); + auto vb1 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(b_data_ptr + i + src_stride)))), + v_zp1); + auto vb2 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64( + reinterpret_cast<__m128i*>(b_data_ptr + i + 2 * src_stride)))), + v_zp2); + auto vb3 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64( + reinterpret_cast<__m128i*>(b_data_ptr + i + 3 * src_stride)))), + v_zp3); + + vsum0 = _mm256_fmadd_ps(va, vb0, vsum0); + vsum1 = _mm256_fmadd_ps(va, vb1, vsum1); + vsum2 = _mm256_fmadd_ps(va, vb2, vsum2); + vsum3 = _mm256_fmadd_ps(va, vb3, vsum3); + } + hsum(vsum0); + hsum(vsum1); + hsum(vsum2); + hsum(vsum3); + float group_sum0 = _mm256_cvtss_f32(vsum0); + float group_sum1 = _mm256_cvtss_f32(vsum1); + float group_sum2 = _mm256_cvtss_f32(vsum2); + float group_sum3 = _mm256_cvtss_f32(vsum3); + for (; i < group_size; i++) { + group_sum0 += a[dst_offset + i] * (b[i] - b0[1]); + group_sum1 += a[dst_offset + i] * (b[i + src_stride] - b1[1]); + group_sum2 += a[dst_offset + i] * (b[i + 2 * src_stride] - b2[1]); + group_sum3 += a[dst_offset + i] * 
(b[i + 3 * src_stride] - b3[1]); + } + sum0 += group_sum0 * b0[0]; + sum1 += group_sum1 * b1[0]; + sum2 += group_sum2 * b2[0]; + sum3 += group_sum3 * b3[0]; + dst_offset += group_size; + src_offset += group_size + params_offset; } - hsum(vsum0); - hsum(vsum1); - hsum(vsum2); - hsum(vsum3); - float sum0 = _mm256_cvtss_f32(vsum0); - float sum1 = _mm256_cvtss_f32(vsum1); - float sum2 = _mm256_cvtss_f32(vsum2); - float sum3 = _mm256_cvtss_f32(vsum3); - for (; i < n; i++) { - sum0 += a[i] * (b[i] - b0[1]); - sum1 += a[i] * (b[i + n + 8] - b1[1]); - sum2 += a[i] * (b[i + 2 * (n + 8)] - b2[1]); - sum3 += a[i] * (b[i + 3 * (n + 8)] - b3[1]); - } - c[0] = sum0 * b0[0]; - c[1] = sum1 * b1[0]; - c[2] = sum2 * b2[0]; - c[3] = sum3 * b3[0]; + c[0] = sum0; + c[1] = sum1; + c[2] = sum2; + c[3] = sum3; c += 4; - b += 4 * (n + 8) - 8; + b += 4 * src_stride; } for (; j < block_size; j++) { - auto vsum = _mm256_setzero_ps(); - auto b0 = reinterpret_cast(b); - auto v_zp = _mm256_set1_ps(b0[1]); - size_t i = 0; - b += 8; - for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { - auto va = mm256_uni_loadu_ps(a + i); - auto vb = _mm256_sub_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), - v_zp); - vsum = _mm256_fmadd_ps(va, vb, vsum); - } - hsum(vsum); - float sum = _mm256_cvtss_f32(vsum); - for (; i < n; i++) { - sum += a[i] * (b[i] - b0[1]); + src_offset = 0; + dst_offset = 0; + float sum = 0; + while (dst_offset < n) { + auto vsum = _mm256_setzero_ps(); + auto b0 = reinterpret_cast(b + src_offset); + auto v_zp = _mm256_set1_ps(b0[1]); + size_t i = 0; + uint8_t* b_data_ptr = b + src_offset + params_offset; + for (; i + vec_len_f32_avx2 <= group_size; i += vec_len_f32_avx2) { + auto va = mm256_uni_loadu_ps(a + dst_offset + i); + auto vb = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(b_data_ptr + i)))), + v_zp); + vsum = _mm256_fmadd_ps(va, vb, vsum); + } + hsum(vsum); + float group_sum = _mm256_cvtss_f32(vsum); + for (; i < group_size; i++) { + group_sum += a[i + dst_offset] * (b_data_ptr[i] - b0[1]); + } + sum += group_sum * b0[0]; + dst_offset += group_size; + src_offset += group_size + params_offset; } - b += n; - *c++ = sum * b0[0]; + b += src_stride; + *c++ = sum; } return; # endif for (size_t j = 0; j < block_size; j++) { float sum = 0; - auto b0 = reinterpret_cast(b); - b += 8; - for (size_t i = 0; i < n; i++) { - sum += a[i] * (b[i] - b0[1]); + dst_offset = 0; + src_offset = 0; + while (dst_offset < n) { + auto b0 = reinterpret_cast(b + src_offset); + float group_sum = 0.0f; + for (size_t i = 0; i < group_size; i++) { + group_sum += a[dst_offset + i] * (b[src_offset + params_offset + i] - b0[1]); + } + sum += group_sum * b0[0]; + dst_offset += group_size; + src_offset += group_size + params_offset; } - b += n; - *c++ = sum * b0[0]; + b += src_stride; + *c++ = sum; } } @@ -634,73 +853,138 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } // N must be multiple of 16 -template -void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template ::type = true> +void transpose_16NxK(TDST* dst, + void* src, + TDST* tmp, + const size_t N, + const size_t K, + const size_t dst_stride, + const size_t src_stride, + const size_t group_size) { size_t k = 0; + auto* src_ptr = reinterpret_cast::value_type*>(src); for (; k + 16 <= K; k += 16) { for (size_t n = 0; n < N; n += 16) { - transpose_16x16_kernel(dst + n, src + 
n * src_stride, dst_stride, src_stride); + transpose_16x16_kernel(dst + n, src_ptr + n * src_stride, dst_stride, src_stride); } dst += 16 * dst_stride; - src += 16; + src_ptr += 16; } if (k < K) { for (size_t n = 0; n < N; n += 16) { - transpose_16xK_kernel(dst + n, src + n * src_stride, K - k, dst_stride, src_stride); + transpose_16xK_kernel(dst + n, src_ptr + n * src_stride, K - k, dst_stride, src_stride); } } } - # if defined(HAVE_AVX512F) template ::value || std::is_same::value), bool>::type> -static void transpose_16NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { + ov::element::Type_t SRC_PREC, + typename std::enable_if<(SRC_PREC == ov::element::bf16 || SRC_PREC == ov::element::f16) && + (SRC_PREC == precision_of::value), + bool>::type = true> +static void transpose_16NxK(T* dst, + T* src, + T* tmp, + const size_t N, + const size_t K, + const size_t dst_stride, + const size_t src_stride, + const size_t group_size) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); - transpose_16NxK(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); + transpose_16NxK(d, + s, + reinterpret_cast(0), + N, + K >> 1, + dst_stride, + src_stride >> 1, + group_size); } # endif -template -void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template ::type = true> +void transpose_16NxK(TDST* dst, + void* src, + TDST* tmp, + const size_t N, + const size_t K, + const size_t dst_stride, + const size_t src_stride, + const size_t group_size) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) - auto s = src; + auto s = reinterpret_cast::value_type*>(src); auto t = tmp; + // if group_size not set, the whole row is used as a group for (size_t n = 0; n < N; n++) { - auto f = reinterpret_cast(s); - attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); - s += src_stride + 2 * sizeof(float); + size_t src_offset = 0; + size_t dst_offset = 0; + while (dst_offset < K) { + auto f = reinterpret_cast(s + src_offset); + attn_dequant_kernel(s + src_offset + sizeof(float) * 2, + t + dst_offset, + group_size, + f[0], + f[1]); + src_offset += group_size + sizeof(float) * 2; + dst_offset += group_size; + } + s += src_offset; t += src_stride; } - transpose_16NxK(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); + transpose_16NxK::value>(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride, 0); } // dequant f16/u8 to float -template -static inline void dequant(T* dst, T* src, size_t N, size_t K) { +template ::type = true> +static inline void dequant(T* dst, void* src, const size_t N, const size_t K, const size_t group_size) { // never called OPENVINO_THROW("dequant: should not be called."); } - -static inline void dequant(float* dst, ov::float16* src, size_t N, size_t K) { +template ::type = true> +static inline void dequant(float* dst, ov::float16* src, const size_t N, const size_t K, const size_t group_size) { cvt_copy(dst, src, K * N); } -template -void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { +template ::type = true> +void dequant(TDST* dst, uint8_t* src, const size_t N, const size_t K, const size_t group_size) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized 
feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; + const size_t params_offset = sizeof(float) * 2; + const size_t sub_byte_mulitplier = get_sub_byte_multiplier(SRC_PREC); + for (size_t n = 0; n < N; n++) { - auto f = reinterpret_cast(s); - attn_dequant_u8_kernel(s + 2 * sizeof(float), dst, K, f[0], f[1]); - s += K + 2 * sizeof(float); + size_t src_offset = 0; + size_t dst_offset = 0; + while (dst_offset < K) { + auto f = reinterpret_cast(s + src_offset); + attn_dequant_kernel(s + src_offset + params_offset, + dst + dst_offset, + group_size, + f[0], + f[1]); + src_offset += group_size / sub_byte_mulitplier + params_offset; + dst_offset += group_size; + } + s += src_offset; dst += K; } } @@ -772,54 +1056,101 @@ static void pack_32xK_kernel(T* dst, T* src, size_t dst_stride, size_t src_strid } } -template ::value || std::is_same::value), bool>::type> -static void pack_32NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template ::value != ov::element::f32 && + (SRC_PREC == ov::element::bf16 || SRC_PREC == ov::element::f16), + bool>::type = true> +static void pack_32NxK(TDST* dst, + void* src, + TDST* tmp, + const size_t N, + const size_t K, + const size_t dst_stride, + const size_t src_stride, + const size_t group_size) { + auto src_ptr = reinterpret_cast::value_type*>(src); for (size_t n = 0; n < N; n += 32) { size_t k = 0; for (; k + 32 <= K; k += 32) { - pack_32x32_kernel(dst + k * 2, src + k, dst_stride, src_stride); + pack_32x32_kernel(dst + k * 2, src_ptr + k, dst_stride, src_stride); } if (k + 16 <= K) { - pack_32x16_kernel(dst + k * 2, src + k, dst_stride, src_stride); + pack_32x16_kernel(dst + k * 2, src_ptr + k, dst_stride, src_stride); k += 16; } if (k < K) { - pack_32xK_kernel(dst + k * 2, src + k, dst_stride, src_stride, K - k); + pack_32xK_kernel(dst + k * 2, src_ptr + k, dst_stride, src_stride, K - k); } dst += 32 * dst_stride; - src += 32 * src_stride; + src_ptr += 32 * src_stride; } } -template ::value || std::is_same::value), bool>::type> -static void pack_32NxK(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template ::value != ov::element::f32 && + (SRC_PREC == ov::element::u4 || SRC_PREC == ov::element::u8), + bool>::type = true> +static void pack_32NxK(TDST* dst, + void* src, + TDST* tmp, + const size_t N, + const size_t K, + const size_t dst_stride, + const size_t src_stride, + const size_t group_size) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) - auto s = src; + auto s = reinterpret_cast(src); auto t = tmp; + // if group_size not set, the whole row is used as a group + const size_t sub_byte_mulitplier = get_sub_byte_multiplier(SRC_PREC); for (size_t n = 0; n < N; n++) { - auto f = reinterpret_cast(s); - attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); - s += src_stride + 2 * sizeof(float); + size_t src_offset = 0; + size_t dst_offset = 0; + while (dst_offset < K) { + auto f = reinterpret_cast(s + src_offset); + attn_dequant_kernel(s + (src_offset + sizeof(float) * 2), + t + dst_offset, + group_size, + f[0], + f[1]); + src_offset += group_size / sub_byte_mulitplier + sizeof(float) * 2; + dst_offset += group_size; + } + s += src_offset; t += src_stride; } - pack_32NxK(dst, tmp, 
reinterpret_cast(0), N, K, dst_stride, src_stride); + pack_32NxK::value>(dst, + tmp, + reinterpret_cast(0), + N, + K, + dst_stride, + src_stride, + group_size); } # endif -template -static void pack_32NxK(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template ::value == ov::element::f32, bool>::type = true> +static void pack_32NxK(TDST* dst, + void* src, + TDST* tmp, + const size_t N, + const size_t K, + const size_t dst_stride, + const size_t src_stride, + const size_t group_size) { // never called OPENVINO_THROW("pack_32NxK: should not be called."); } -template +template struct MHAHelper { // initialize once size_t _H; @@ -831,6 +1162,8 @@ struct MHAHelper { size_t _nthr; size_t _sliding_window; float _d_scale; + size_t _key_group_size = 0; + size_t _value_group_size = 0; PlainTensor _weight; // [nthr, H, 32, rnd_up(kv_len, block_size)], shared by first and second loop along bh PlainTensor _output; // [nthr, 32, H, S], shared by first and second loop along bh @@ -860,6 +1193,12 @@ struct MHAHelper { _weight.resize({size_t{1}, size_t{1}, size_t{1}, size_t{1}}); } + explicit MHAHelper(size_t key_group_size, size_t value_group_size) + : _key_group_size(key_group_size), + _value_group_size(value_group_size) { + _weight.resize({size_t{1}, size_t{1}, size_t{1}, size_t{1}}); + } + void init(size_t H, size_t S, size_t SV, @@ -947,11 +1286,11 @@ struct MHAHelper { if ((S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6)) { if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && precision_of::value == ov::element::bf16 && - precision_of::value == ov::element::bf16) { + precision_of::value == ov::element::bf16 && VALUE_PREC == ov::element::bf16) { _fastpath_valid_prec = ov::element::bf16; } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_fp16) && precision_of::value == ov::element::f16 && - precision_of::value == ov::element::f16) { + precision_of::value == ov::element::f16 && VALUE_PREC == ov::element::f16) { _fastpath_valid_prec = ov::element::f16; } } @@ -1020,7 +1359,7 @@ struct MHAHelper { auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); - constexpr bool q_cache_is_same = precision_of::value == precision_of::value; + constexpr bool q_cache_is_same = precision_of::value == VALUE_PREC; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); for (size_t h = hq_beg; h < hq_end; h++) { auto* q_ptr = query.ptr(h, q_start, 0); @@ -1164,7 +1503,7 @@ struct MHAHelper { for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(h, pq), - present_key.ptr(block_number, hk), + present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk); } } @@ -1176,10 +1515,11 @@ struct MHAHelper { for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(h, pq), - present_key.ptr(block_number, hk), + present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk, _S, - std::min(_block_size, cur_kv_len - pk)); + std::min(_block_size, cur_kv_len - pk), + _key_group_size); } } } @@ -1217,14 +1557,20 @@ struct MHAHelper { memset(_output.ptr(ithr), 0, q_len * _H * _SV * sizeof(float)); for (size_t pv = 0, i = 0; pv < cur_kv_len; pv += _block_size, i++) { auto block_number = block_table[i]; - auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) 
{ - attn_acc_value_block(_output.ptr(ithr, pq, h), - _weight.ptr(ithr, h, pq) + pv, - v, - _SV, - std::min(_block_size, cur_kv_len - pv)); + auto sub_byte_multiplier = get_sub_byte_multiplier(present_value.get_precision()); + size_t v_stride = (block_number * present_value.m_strides[0] + hk * present_value.m_strides[1]) * + present_value.get_precision().size() / sub_byte_multiplier; + auto* v_ptr = reinterpret_cast::value_type*>( + present_value.m_ptr.get() + v_stride); + attn_acc_value_block::value_type, VALUE_PREC>( + _output.ptr(ithr, pq, h), + _weight.ptr(ithr, h, pq) + pv, + v_ptr, + _SV, + std::min(_block_size, cur_kv_len - pv), + _value_group_size); } } } @@ -1301,7 +1647,7 @@ struct MHAHelper { for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(b, h, pq), - present_key.ptr(block_number, hk), + present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk); } } @@ -1310,10 +1656,11 @@ struct MHAHelper { for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(b, h, pq), - present_key.ptr(block_number, hk), + present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk, _S, - std::min(_block_size, context_len - pk)); + std::min(_block_size, context_len - pk), + _key_group_size); } } } @@ -1380,14 +1727,21 @@ struct MHAHelper { // kv_len must be valid if (pv < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pv_in_blocks]; - auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { - attn_acc_value_block(_output_bhl.ptr(ithr, b, pq, h), - _weight_bhl.ptr(b, h, pq) + pv, - v, - _SV, - std::min(_block_size, context_len - pv)); + auto sub_byte_multiplier = get_sub_byte_multiplier(present_value.get_precision()); + size_t v_stride = + (block_number * present_value.m_strides[0] + hk * present_value.m_strides[1]) * + present_value.get_precision().size() / sub_byte_multiplier; + auto* v_ptr = reinterpret_cast::value_type*>( + present_value.m_ptr.get() + v_stride); + attn_acc_value_block::value_type, VALUE_PREC>( + _output_bhl.ptr(ithr, b, pq, h), + _weight_bhl.ptr(b, h, pq) + pv, + v_ptr, + _SV, + std::min(_block_size, context_len - pv), + _value_group_size); } } } @@ -1408,9 +1762,9 @@ struct MHAHelper { } }; -template +template struct MHA { - MHAHelper& _helper; + MHAHelper& _helper; struct AttnWorkItem { int32_t batch_in_reorder; // which batch in reorder buffer will be used int32_t batch_in_seq; // batch idx in sequence @@ -1510,7 +1864,7 @@ struct MHA { WorkItems _workitems; - MHA(MHAHelper& helper) : _helper(helper) {} + MHA(MHAHelper& helper) : _helper(helper) {} // one loop to handle first and second tokens void exec_loop_mixed(const PlainTensor& q, @@ -1527,7 +1881,7 @@ struct MHA { auto Hk = v_cache.m_dims[1]; constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); - constexpr bool q_cache_is_same = precision_of::value == precision_of::value; + constexpr bool q_cache_is_same = precision_of::value == VALUE_PREC; auto attn_work_count = _workitems.attn_work_size(); auto reorder_work_count = _workitems.reorder_work_size(); @@ -1547,30 +1901,45 @@ struct MHA { return; auto ithr = parallel_get_thread_num(); - auto* k_ptr = k_cache.ptr(block_number, hk); - auto* v_ptr = v_cache.ptr(block_number, hk); - transpose_16NxK(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), - k_ptr, - _helper._output.template ptr(ithr), - 
_helper._block_size, - _helper._S, - _helper._block_size, - _helper._S); + auto* k_ptr = k_cache.ptr(block_number, hk); + + transpose_16NxK::value>( + _helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + k_ptr, + _helper._output.template ptr(ithr), + _helper._block_size, + _helper._S, + _helper._block_size, + _helper._S, + _helper._key_group_size); + if (q_is_xf16) { - pack_32NxK(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), - v_ptr, - _helper._output.template ptr(ithr), - _helper._block_size, - _helper._SV, - rnd_up(_helper._SV, _helper._block_size), - _helper._SV); + auto sub_byte_multiplier = get_sub_byte_multiplier(v_cache.get_precision()); + size_t v_stride = (block_number * v_cache.m_strides[0] + hk * v_cache.m_strides[1]) * + v_cache.get_precision().size() / sub_byte_multiplier; + auto* v_ptr = v_cache.m_ptr.get() + v_stride; + pack_32NxK( + _helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + v_ptr, + _helper._output.template ptr(ithr), + _helper._block_size, + _helper._SV, + rnd_up(_helper._SV, _helper._block_size), + _helper._SV, + _helper._value_group_size); } else { // need to decompress if (!q_cache_is_same) { - dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), - v_ptr, - _helper._block_size, - _helper._SV); + auto sub_byte_multiplier = get_sub_byte_multiplier(v_cache.get_precision()); + size_t v_stride = (block_number * v_cache.m_strides[0] + hk * v_cache.m_strides[1]) * + v_cache.get_precision().size() / sub_byte_multiplier; + auto* v_ptr = v_cache.m_ptr.get() + v_stride; + dequant( + _helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + v_ptr, + _helper._block_size, + _helper._SV, + _helper._value_group_size); } } }); @@ -1721,14 +2090,18 @@ struct MHA { } }; -template +template struct AttentionExecutor : public PagedAttentionExecutor { - MHAHelper _helper; - MHA _kernel; + MHAHelper _helper; + MHA _kernel; PlainTensor _slot_mapping; AttentionExecutor() : _kernel(_helper) {} + explicit AttentionExecutor(size_t key_group_size, size_t value_group_size) + : _helper(MHAHelper(key_group_size, value_group_size)), + _kernel(_helper) {} + void init(const std::vector& inputs, const std::vector& outputs, PlainTensor& q, @@ -1769,8 +2142,22 @@ struct AttentionExecutor : public PagedAttentionExecutor { // The layout for per token per head for u8 kv cache: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized // feature(u8,idx_S)| The actual size needs to deduct scale and zeropoint. - auto S = k_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); - auto SV = v_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); + const size_t key_sub_byte_multiplier = get_sub_byte_multiplier(k_cache.get_precision()); + const size_t value_sub_byte_multiplier = get_sub_byte_multiplier(v_cache.get_precision()); + const size_t key_params_size = sizeof(float) * 2 * key_sub_byte_multiplier; + // u4 needs scale + zp. s4 needs scale. + const size_t param_size = + one_of(v_cache.get_precision(), ov::element::u4, ov::element::u8) ? sizeof(float) * 2 : sizeof(float); + const size_t value_params_size = param_size * value_sub_byte_multiplier; + size_t key_group_num = + _helper._key_group_size ? k_cache.size(3) / (_helper._key_group_size + key_params_size) : 1; + size_t value_group_num = + _helper._value_group_size ? 
v_cache.size(3) / (_helper._value_group_size + value_params_size) : 1; + auto S = k_cache.size(3) - (k_cache.get_precision().is_real() ? 0 : key_params_size * key_group_num); + auto SV = v_cache.size(3) - (v_cache.get_precision().is_real() ? 0 : value_params_size * value_group_num); + // revise group_size if it's zero. + _helper._key_group_size = _helper._key_group_size ? _helper._key_group_size : S; + _helper._value_group_size = _helper._value_group_size ? _helper._value_group_size : SV; auto block_size = k_cache.size(2); auto H = q.size(1) / S; auto h_each_group_len = 1; @@ -1785,9 +2172,10 @@ struct AttentionExecutor : public PagedAttentionExecutor { q = q.reshape({B_token, H, 1, S}); k = k.reshape({B_token, Hk, 1, S}); v = v.reshape({B_token, Hk, 1, SV}); + if (k_cache.m_dt == ov::element::Type_t::u8) { - k_cache.assert_dims({0, Hk, block_size, S + sizeof(float) * 2}, true); - v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, SV + sizeof(float) * 2}); + k_cache.assert_dims({0, Hk, block_size, S + key_params_size * key_group_num}, true); + v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, SV + value_params_size * value_group_num}); } else { k_cache.assert_dims({0, Hk, block_size, S}, true); v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, SV}); @@ -1837,7 +2225,13 @@ struct AttentionExecutor : public PagedAttentionExecutor { } if (k_cache.m_dt == ov::element::Type_t::u8) { - paged_attn_quantkv(k, v, k_cache, v_cache, _slot_mapping); + paged_attn_quantkv(k, + v, + k_cache, + v_cache, + _slot_mapping, + _helper._key_group_size, + _helper._value_group_size); } else { paged_attn_memcpy(k, v, k_cache, v_cache, _slot_mapping); } @@ -1887,40 +2281,79 @@ struct AttentionExecutor : public PagedAttentionExecutor { }; #endif -std::shared_ptr make_pa_executor(ov::element::Type data_type, ov::element::Type kvcache_type) { +std::shared_ptr make_pa_executor(ov::element::Type data_type, + ov::element::Type key_cache_type, + ov::element::Type value_cache_type, + size_t key_group_size, + size_t value_group_size) { std::shared_ptr executor; #ifdef OPENVINO_ARCH_X86_64 if (data_type == ov::element::bf16) { # if defined(HAVE_AVX512F) - if (kvcache_type == ov::element::u8) { - executor = std::make_shared>(); + if (key_cache_type == ov::element::u8) { + if (value_cache_type == ov::element::u4) { + executor = + std::make_shared>(key_group_size, + value_group_size); + } else if (value_cache_type == ov::element::u8) { + executor = + std::make_shared>(key_group_size, + value_group_size); + } else { + OPENVINO_THROW("make_pa_executor: key_cache_type u8 with value_cache_type ", + value_cache_type.to_string(), + " is not support"); + } + } else { - OPENVINO_ASSERT(kvcache_type == ov::element::bf16, "expect kvcache type bf16, current: ", kvcache_type); - executor = std::make_shared>(); + OPENVINO_ASSERT(key_cache_type == ov::element::bf16, "expect kvcache type bf16, current: ", key_cache_type); + executor = std::make_shared>(); } # else OPENVINO_THROW("make_pa_executor: bf16 needs avx512+ hardware."); # endif } else if (data_type == ov::element::f16) { # if defined(HAVE_AVX512F) - if (kvcache_type == ov::element::u8) { - executor = std::make_shared>(); + if (key_cache_type == ov::element::u8) { + if (value_cache_type == ov::element::u4) { + executor = std::make_shared>(key_group_size, + value_group_size); + } else if (value_cache_type == ov::element::u8) { + executor = std::make_shared>(key_group_size, + value_group_size); + } else { + OPENVINO_THROW("make_pa_executor: key_cache_type u8 with 
value_cache_type ", + value_cache_type.to_string(), + " is not support"); + } } else { - OPENVINO_ASSERT(kvcache_type == ov::element::f16, "expect kvcache type f16, current: ", kvcache_type); - executor = std::make_shared>(); + OPENVINO_ASSERT(key_cache_type == ov::element::f16, "expect kvcache type f16, current: ", key_cache_type); + executor = std::make_shared>(); } # else OPENVINO_THROW("make_pa_executor: f16 needs avx512+ hardware."); # endif } else if (data_type == ov::element::f32) { - if (kvcache_type == ov::element::u8) { - executor = std::make_shared>(); - } else if (kvcache_type == ov::element::f16) { - executor = std::make_shared>(); + if (key_cache_type == ov::element::u8) { + if (value_cache_type == ov::element::u4) { + executor = std::make_shared>(key_group_size, + value_group_size); + } else if (value_cache_type == ov::element::u8) { + executor = std::make_shared>(key_group_size, + value_group_size); + } else { + OPENVINO_THROW("make_pa_executor: key_cache_type u8 with value_cache_type ", + value_cache_type.to_string(), + " is not support"); + } + } else if (key_cache_type == ov::element::f16) { + executor = std::make_shared>(key_group_size, + value_group_size); } else { - OPENVINO_ASSERT(kvcache_type == ov::element::f32, "expect kvcache type f32, current: ", kvcache_type); - executor = std::make_shared>(); + OPENVINO_ASSERT(key_cache_type == ov::element::f32, "expect kvcache type f32, current: ", key_cache_type); + executor = + std::make_shared>(key_group_size, value_group_size); } } else { OPENVINO_THROW("make_pa_executor: unsupported precision: ", data_type); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp index d28125b3898460..64e4eefc3b760d 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp @@ -17,7 +17,11 @@ namespace Extensions { namespace Cpu { namespace XARCH { -std::shared_ptr make_pa_executor(ov::element::Type data_type, ov::element::Type kvcache_type); +std::shared_ptr make_pa_executor(ov::element::Type data_type, + ov::element::Type key_cache_type, + ov::element::Type value_cache_type, + size_t key_group_size, + size_t value_group_size); } // namespace XARCH } // namespace Cpu diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp index b51b2b3d8029a9..54aa80e9dff7c0 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp @@ -84,13 +84,14 @@ void PagedAttention::initSupportedPrimitiveDescriptors() { OPENVINO_ASSERT(orgInputNumber == 13, "The input number of PagedAttention should be 13."); // kvcache, float, [] - auto past_kv_input_mem_precision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); + auto past_key_input_mem_precision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); + auto past_value_input_mem_precision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE); config.inConfs[PagedAttentionExecutor::ID_KCACHE].setMemDesc( creatorsMap.at(LayoutType::ncsp) - ->createSharedDesc(past_kv_input_mem_precision, getInputShapeAtPort(PagedAttentionExecutor::ID_KCACHE))); + ->createSharedDesc(past_key_input_mem_precision, getInputShapeAtPort(PagedAttentionExecutor::ID_KCACHE))); config.inConfs[PagedAttentionExecutor::ID_VCACHE].setMemDesc( creatorsMap.at(LayoutType::ncsp) - 
->createSharedDesc(past_kv_input_mem_precision, getInputShapeAtPort(PagedAttentionExecutor::ID_VCACHE))); + ->createSharedDesc(past_value_input_mem_precision, getInputShapeAtPort(PagedAttentionExecutor::ID_VCACHE))); // past_lens, int, [b_seq] config.inConfs[PagedAttentionExecutor::ID_PAST_LENS].setMemDesc( creatorsMap.at(LayoutType::ncsp) @@ -140,8 +141,14 @@ void PagedAttention::createPrimitive() { auto builder = [&](const PagedAttentionKey& key) -> std::shared_ptr { #ifdef OPENVINO_ARCH_X86_64 - auto kvCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); - return make_pa_executor(rtPrecision, kvCachePrecision); + // Since we are quantize only last dim it's safe to use the last dim of KV. + auto kCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); + auto vCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE); + const auto& cpuConfig = context->getConfig(); + + size_t key_group_size = cpuConfig.keyCacheGroupSize; + size_t value_group_size = cpuConfig.valueCacheGroupSize; + return make_pa_executor(rtPrecision, kCachePrecision, vCachePrecision, key_group_size, value_group_size); #else return nullptr; #endif @@ -202,6 +209,20 @@ void PagedAttention::execute(dnnl::stream strm) { bool PagedAttention::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { + auto vCachePrecision = op->get_input_element_type(PagedAttentionExecutor::ID_VCACHE); + auto kCachePrecision = op->get_input_element_type(PagedAttentionExecutor::ID_KCACHE); + if (one_of(vCachePrecision, + ov::element::u4, + ov::element::u8, + ov::element::f32, + ov::element::f16, + ov::element::bf16)) { + if (!one_of(kCachePrecision, ov::element::u8, ov::element::f16, ov::element::f32, ov::element::bf16)) { + errorMessage = "PageAttn key value cache compression doesn't support key cache prec " + + kCachePrecision.to_string() + " value cache prec " + vCachePrecision.to_string(); + return false; + } + } int orgInput = static_cast(op->get_input_size()); if (op->get_type_name() == std::string("PagedAttentionExtension") && orgInput == PagedAttentionExecutor::ID_SLIDING_WINDOW + 1) { diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index 71d39a1fce5ba8..a4af32ce07046a 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -436,12 +436,14 @@ struct MHAKernel { } T* v_ptr = is_xf16 ? &wv_scratch_b.at({b, h / h_each_group_len, 0}) : &present_value.at({b, h / h_each_group_len, 0, 0}); - wv_gemm_ptr->executeGemm(m_cnt < m_block_size, - w_ptr, - v_ptr, - fp32_out_ptr, - wsp.data() + tid * wsp_size_per_thread, - wv_scratch_a ? &wv_scratch_a.at({tid, 0}) : nullptr); + wv_gemm_ptr->executeGemm(m_cntget_scratch_a_size()> 0 + ? 
&wv_scratch_a.at({tid, 0}) + : nullptr); if (is_xf16) { if (has_out_transpose) { attn_memcpy2d_kernel(&fp32_out.at({b, m_start, h, 0}), @@ -1059,6 +1061,26 @@ ScaledDotProductAttention::ScaledDotProductAttention(const std::shared_ptrgetConfig(); + const auto& keyCachePrecision = cpuConfig.keyCachePrecision; + const auto& valueCachePrecision = cpuConfig.valueCachePrecision; + const auto keyDims = getInputShapeAtPort(1).getDims(); + const auto valueDims = getInputShapeAtPort(2).getDims(); + const auto keyS = *(keyDims.end() - 1); + const auto valueS = *(valueDims.end() - 1); + OPENVINO_ASSERT(valueCachePrecision == keyCachePrecision, + "CPU: SDPA node only supports same key/value cache precision"); + OPENVINO_ASSERT(one_of(keyCachePrecision, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8), + "CPU: SDPA only supports key/value cache precision f32, f16, bf16, u8 but gets ", + keyCachePrecision); + m_key_quant_param.groupSize = (cpuConfig.keyCacheGroupSize == 0 || keyS % cpuConfig.keyCacheGroupSize != 0) + ? keyS + : cpuConfig.keyCacheGroupSize; + m_key_quant_param.precision = keyCachePrecision; + m_value_quant_param.groupSize = (cpuConfig.valueCacheGroupSize == 0 || valueS % cpuConfig.valueCacheGroupSize != 0) + ? valueS + : cpuConfig.valueCacheGroupSize; + m_key_quant_param.precision = valueCachePrecision; if (const auto node = std::dynamic_pointer_cast(op)) { m_config.config.is_causal = node->get_causal(); @@ -1833,12 +1855,16 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M ov::element::Type ScaledDotProductAttention::getKVCachePrecision() { ov::element::Type kvcache_precision; + // TODO: SDPA only supports same key/value cache precision. auto rtPrecision = getRuntimePrecision(); - auto kvCachePrecisionHint = context->getConfig().kvCachePrecision; + auto keyCachePrecisionHint = context->getConfig().keyCachePrecision; + auto valueCachePrecisionHint = context->getConfig().valueCachePrecision; bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) && - rtPrecision != ov::element::bf16 && kvCachePrecisionHint == ov::element::f16; + rtPrecision != ov::element::bf16 && + (keyCachePrecisionHint == ov::element::f16 && valueCachePrecisionHint == ov::element::f16); kvcache_precision = enableKVCacheFP16 ? 
ov::element::f16 : rtPrecision; - bool use_int8_kv_cache_precision = kvCachePrecisionHint == ov::element::u8; + bool use_int8_kv_cache_precision = + (keyCachePrecisionHint == ov::element::u8 && valueCachePrecisionHint == ov::element::u8); if (use_int8_kv_cache_precision) kvcache_precision = ov::element::u8; else diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.h b/src/plugins/intel_cpu/src/nodes/scaled_attn.h index 21b9056ba9517c..2917342314fafd 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.h +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.h @@ -47,7 +47,10 @@ class ScaledDotProductAttention : public Node { real_order = {permute_axes[2], permute_axes[0], permute_axes[1], permute_axes[3]}; return real_order; } - + struct SDPAQuantParam { + ov::element::Type precision = ov::element::undefined; + size_t groupSize = 0; + }; ov::element::Type getKVCachePrecision(); private: @@ -86,6 +89,8 @@ class ScaledDotProductAttention : public Node { // (0, 1, 2, 3) for BHLS // (2, 0, 1, 3) for LBHS std::vector m_kvstate_layout = {2, 0, 1, 3}; + SDPAQuantParam m_key_quant_param; + SDPAQuantParam m_value_quant_param; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index c0d8d655a753c8..103c33b6c00be4 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -377,6 +377,14 @@ ov::Any Plugin::get_property(const std::string& name, const ov::AnyMap& options) engConfig.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(engConfig.kvCachePrecision); + } else if (name == ov::key_cache_precision) { + return decltype(ov::key_cache_precision)::value_type(engConfig.keyCachePrecision); + } else if (name == ov::value_cache_precision) { + return decltype(ov::value_cache_precision)::value_type(engConfig.valueCachePrecision); + } else if (name == ov::key_cache_group_size) { + return decltype(ov::key_cache_group_size)::value_type(engConfig.keyCacheGroupSize); + } else if (name == ov::value_cache_group_size) { + return decltype(ov::value_cache_group_size)::value_type(engConfig.valueCacheGroupSize); } return get_ro_property(name, options); } @@ -420,6 +428,10 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), + RW_property(ov::key_cache_precision.name()), + RW_property(ov::value_cache_precision.name()), + RW_property(ov::key_cache_group_size.name()), + RW_property(ov::value_cache_group_size.name()), }; std::vector supportedProperties; diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 073faba7f8d96f..6655a2a5e7d48d 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -2,14 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/runtime/properties.hpp" + #include -#include "utils/properties_test.hpp" -#include "openvino/runtime/system_conf.hpp" -#include "openvino/runtime/core.hpp" #include "openvino/runtime/compiled_model.hpp" -#include "openvino/runtime/properties.hpp" +#include 
"openvino/runtime/core.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" +#include "openvino/runtime/system_conf.hpp" +#include "utils/properties_test.hpp" namespace { @@ -40,6 +41,10 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), + RO_property(ov::key_cache_precision.name()), + RO_property(ov::value_cache_precision.name()), + RO_property(ov::key_cache_group_size.name()), + RO_property(ov::value_cache_group_size.name()), }; ov::Core ie; @@ -83,7 +88,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSetROPropertiesThrow) { TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriorityThanThroughputHint) { ov::Core ie; - int32_t streams = 1; // throughput hint should apply higher number of streams + int32_t streams = 1; // throughput hint should apply higher number of streams int32_t value = 0; OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams))); @@ -96,7 +101,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriori TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriorityThanLatencyHint) { ov::Core ie; - int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams + int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams int32_t value = 0; OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams))); @@ -109,7 +114,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriori TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPriorityThanLatencyHint) { ov::Core ie; - int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams + int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams int32_t value = 0; OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY))); @@ -124,7 +129,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPrior TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPriorityThanThroughputHint) { ov::Core ie; - int32_t streams = 1; // throughput hint should apply higher number of streams + int32_t streams = 1; // throughput hint should apply higher number of streams int32_t value = 0; ov::AnyMap config; @@ -182,6 +187,36 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckKVCachePrecision) { ASSERT_EQ(kv_cache_precision_value, ov::element::f32); } +TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCachePrecision) { + ov::Core core; + + core.set_property(deviceName, ov::key_cache_precision(ov::element::f16)); + core.set_property(deviceName, ov::value_cache_precision(ov::element::u4)); + ov::CompiledModel compiledModel = core.compile_model(model, deviceName); + + auto key_cache_precision_value = ov::element::undefined; + auto value_cache_precision_value = ov::element::undefined; + OV_ASSERT_NO_THROW(key_cache_precision_value = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(value_cache_precision_value = compiledModel.get_property(ov::value_cache_precision)); + ASSERT_EQ(key_cache_precision_value, ov::element::f16); + ASSERT_EQ(value_cache_precision_value, ov::element::u4); +} + 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCacheGroupSize) { + ov::Core core; + + core.set_property(deviceName, ov::key_cache_group_size(32)); + core.set_property(deviceName, ov::value_cache_group_size(16)); + ov::CompiledModel compiledModel = core.compile_model(model, deviceName); + + auto key_cache_group_size_value = 0; + auto value_cache_group_size_value = 0; + OV_ASSERT_NO_THROW(key_cache_group_size_value = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(value_cache_group_size_value = compiledModel.get_property(ov::value_cache_group_size)); + ASSERT_EQ(key_cache_group_size_value, 32); + ASSERT_EQ(value_cache_group_size_value, 16); +} + TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeDynamicQuantizationGroupSize) { ov::Core core; @@ -225,7 +260,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableIn ASSERT_FALSE(model_exec_mode_it->is_mutable()); } -TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCoreInferencePrecision) { +TEST_F(OVClassConfigTestCPU, + smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCoreInferencePrecision) { ov::Core ie; auto inference_precision_value = ov::element::undefined; @@ -239,7 +275,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHas ASSERT_EQ(inference_precision_value, bf16_if_can_be_emulated); } -TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreInferencePrecisionHasHigherPriorityThanModelPerformanceExecutionMode) { +TEST_F(OVClassConfigTestCPU, + smoke_CpuExecNetworkCheckCoreInferencePrecisionHasHigherPriorityThanModelPerformanceExecutionMode) { ov::Core ie; auto execution_mode_value = ov::hint::ExecutionMode::ACCURACY; auto inference_precision_value = ov::element::undefined; @@ -257,7 +294,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreInferencePrecisionHasH ASSERT_EQ(inference_precision_value, ov::element::f32); } -TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCorePerformanceExecutionMode) { +TEST_F(OVClassConfigTestCPU, + smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCorePerformanceExecutionMode) { ov::Core ie; auto execution_mode_value = ov::hint::ExecutionMode::PERFORMANCE; auto inference_precision_value = ov::element::undefined; @@ -288,14 +326,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckLogLevel) { OV_ASSERT_NO_THROW(value = compiledModel.get_property(ov::log::level)); ASSERT_EQ(value.as(), ov::log::Level::NO); } - //check set and get - const std::vector logLevels = { - ov::log::Level::ERR, - ov::log::Level::NO, - ov::log::Level::WARNING, - ov::log::Level::INFO, - ov::log::Level::DEBUG, - ov::log::Level::TRACE}; + // check set and get + const std::vector logLevels = {ov::log::Level::ERR, + ov::log::Level::NO, + ov::log::Level::WARNING, + ov::log::Level::INFO, + ov::log::Level::DEBUG, + ov::log::Level::TRACE}; for (unsigned int i = 0; i < logLevels.size(); i++) { ov::Any value; @@ -330,50 +367,109 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) { ov::Core ie; ov::Any type; ov::Any size; + ov::Any keySize; + ov::Any valueSize; + ov::Any keyCacheType; + ov::Any valueCacheType; ov::CompiledModel compiledModel; model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("32", 
"runtime_options", ov::key_cache_group_size.name()); + model->set_rt_info("16", "runtime_options", ov::value_cache_group_size.name()); + model->set_rt_info("u8", "runtime_options", ov::key_cache_precision.name()); + model->set_rt_info("u8", "runtime_options", ov::value_cache_precision.name()); OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName)); OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size)); + OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision)); ASSERT_EQ(type.as(), ov::element::f16); ASSERT_EQ(size.as(), 0); + ASSERT_EQ(keySize.as(), 32); + ASSERT_EQ(valueSize.as(), 16); + ASSERT_EQ(keyCacheType.as(), ov::element::u8); + ASSERT_EQ(valueCacheType.as(), ov::element::u8); } TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) { ov::Core ie; ov::Any type; ov::Any size; + ov::Any keySize; + ov::Any valueSize; + ov::Any keyCacheType; + ov::Any valueCacheType; ov::CompiledModel compiledModel; model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("0", "runtime_options", ov::key_cache_group_size.name()); + model->set_rt_info("0", "runtime_options", ov::value_cache_group_size.name()); + model->set_rt_info("f32", "runtime_options", ov::key_cache_precision.name()); + model->set_rt_info("f32", "runtime_options", ov::value_cache_precision.name()); ov::AnyMap config; config[ov::hint::kv_cache_precision.name()] = "u8"; config[ov::hint::dynamic_quantization_group_size.name()] = "16"; + // propperty has higher priority than rt_info + config[ov::key_cache_group_size.name()] = "32"; + config[ov::value_cache_group_size.name()] = "16"; + // key/value cache prec has higher priority than kvCachePrec + config[ov::key_cache_precision.name()] = "f16"; + config[ov::value_cache_precision.name()] = "bf16"; OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config)); OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size)); + OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision)); ASSERT_EQ(type.as(), ov::element::u8); ASSERT_EQ(size.as(), 16); + ASSERT_EQ(keySize.as(), 32); + ASSERT_EQ(valueSize.as(), 16); + ASSERT_EQ(keyCacheType.as(), ov::element::f16); + ASSERT_EQ(valueCacheType.as(), ov::element::bf16); } TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCoreProperties) { ov::Core core; ov::Any type; ov::Any size; - + ov::Any keySize; + ov::Any valueSize; + ov::Any keyCacheType; + ov::Any valueCacheType; core.set_property(deviceName, ov::hint::kv_cache_precision(ov::element::f32)); core.set_property(deviceName, 
ov::hint::dynamic_quantization_group_size(16)); + core.set_property(deviceName, ov::key_cache_group_size(8)); + core.set_property(deviceName, ov::value_cache_group_size(8)); + core.set_property(deviceName, ov::key_cache_precision(ov::element::f16)); + core.set_property(deviceName, ov::value_cache_precision(ov::element::bf16)); ov::CompiledModel compiledModel; model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("32", "runtime_options", ov::key_cache_group_size.name()); + model->set_rt_info("16", "runtime_options", ov::value_cache_group_size.name()); + // User's setting has higher priority than rt_info + model->set_rt_info("f32", "runtime_options", ov::key_cache_precision.name()); + model->set_rt_info("f32", "runtime_options", ov::value_cache_precision.name()); OV_ASSERT_NO_THROW(compiledModel = core.compile_model(model, deviceName)); OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size)); + OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision)); + ASSERT_EQ(type.as(), ov::element::f32); ASSERT_EQ(size.as(), 16); + ASSERT_EQ(keySize.as(), 8); + ASSERT_EQ(valueSize.as(), 8); + ASSERT_EQ(keyCacheType.as(), ov::element::f16); + ASSERT_EQ(valueCacheType.as(), ov::element::bf16); } -} // namespace +} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp index f72df3f58b69e5..5adf6cbb125185 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp @@ -55,6 +55,10 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) { RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), + RW_property(ov::key_cache_precision.name()), + RW_property(ov::value_cache_precision.name()), + RW_property(ov::key_cache_group_size.name()), + RW_property(ov::value_cache_group_size.name()), }; ov::Core ie;
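The group-size hints exercised above feed the grouped cache layout used by the paged-attention executor: each group stores a float scale and zero-point in front of its quantized elements, and for u4 two elements share one byte, so the scale/zero-point pair occupies extra "elements" of the cache tensor's last dimension. A small self-contained sketch of that per-row bookkeeping, mirroring the arithmetic in AttentionExecutor::init; the helper name and the assumption that the group size evenly divides the feature size are illustrative, not part of this patch.

#include <cstddef>

// Elements occupied in the cache tensor's last dimension by one quantized row of
// `feature_size` values split into groups of `group_size`. Each group is prefixed by
// |scale(f32)|zero-point(f32)|, i.e. 8 bytes, counted in units of the cache element type:
// for u8 that is 8 elements (sub_byte_multiplier == 1), for u4 it is 16 (multiplier == 2).
size_t cache_row_elements(size_t feature_size, size_t group_size, size_t sub_byte_multiplier) {
    const size_t params_elems = 2 * sizeof(float) * sub_byte_multiplier;
    const size_t group_num = feature_size / group_size;  // assumes group_size divides feature_size
    return feature_size + group_num * params_elems;
}
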