From c9672e012ef631b21a056b3b0b8bfd986e72b864 Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov <sergey.shlyapnikov@intel.com>
Date: Fri, 29 Nov 2024 19:49:04 +0400
Subject: [PATCH] [GPU] Update docs related to KV-cache quantization (#27821)

### Details:
 - Update docs related to KV-cache quantization on GPU
 - Allow using `element::u8` as the data type for KV-cache quantization,
   to align with the CPU plugin
---
 .../learn-openvino/llm_inference_guide/llm-inference-hf.rst  | 5 +++--
 .../src/plugin/transformations/kv_cache_compression.cpp      | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst
index a26b670b5314d0..7bf2107482bd3a 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst
@@ -276,9 +276,10 @@ includes **Dynamic quantization** of activations of 4/8-bit quantized MatMuls an
          ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"}
      )
 
-.. note::
+  .. note::
+     Currently, for KV-cache quantization, the GPU plugin ignores the ``DYNAMIC_QUANTIZATION_GROUP_SIZE`` property and uses ``group_size = head_size``. Additionally, it does not support the ``get_state()`` and ``set_state()`` APIs when KV-cache quantization is enabled.
 
-   Currently, both Dynamic quantization and KV-cache quantization are available for CPU device.
+     For GPU, KV-cache quantization is enabled by default on platforms without XMX support and can be disabled by setting ``KV_CACHE_PRECISION`` to ``undefined``.
 
 
 Working with Models Tuned with LoRA
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp
index 561822f9661109..6903b52963a879 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp
@@ -133,7 +133,7 @@ class KVCacheCompressionMatcher : public ov::pass::MatcherPass {
 KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compression_dt) {
     using namespace ov::pass::pattern;
 
-    if (compression_dt != element::i8)
+    if (compression_dt != element::i8 && compression_dt != element::u8)
         return;
 
     const auto quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric;