From c9672e012ef631b21a056b3b0b8bfd986e72b864 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Fri, 29 Nov 2024 19:49:04 +0400 Subject: [PATCH] [GPU] Update docs related to KV-cache quantization (#27821) ### Details: - Update docs related to KV-cache quantization on GPU - Allow `element::u8` to be used as the data type for KV-cache quantization, to align with the CPU plugin --- .../learn-openvino/llm_inference_guide/llm-inference-hf.rst | 5 +++-- .../src/plugin/transformations/kv_cache_compression.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst index a26b670b5314d0..7bf2107482bd3a 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst @@ -276,9 +276,10 @@ includes **Dynamic quantization** of activations of 4/8-bit quantized MatMuls an ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"} ) -.. note:: + .. note:: + Currently, for KV-cache quantization, GPU ignores the ``DYNAMIC_QUANTIZATION_GROUP_SIZE`` property, using ``group_size = head_size``. Additionally, it does not support the ``get_state()`` and ``set_state()`` APIs when KV-cache quantization is enabled. - Currently, both Dynamic quantization and KV-cache quantization are available for CPU device. + For GPU, KV-cache quantization is enabled by default on platforms without XMX support, and can be disabled by setting ``KV_CACHE_PRECISION`` to ``undefined``. 
Working with Models Tuned with LoRA diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index 561822f9661109..6903b52963a879 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -133,7 +133,7 @@ class KVCacheCompressionMatcher : public ov::pass::MatcherPass { KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compression_dt) { using namespace ov::pass::pattern; - if (compression_dt != element::i8) + if (compression_dt != element::i8 && compression_dt != element::u8) return; const auto quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric;