From 75e1ddae5afcc4bea078aa51f4d7b5416a05320a Mon Sep 17 00:00:00 2001
From: "Min, Byung-il"
Date: Mon, 23 Dec 2024 20:52:44 +0900
Subject: [PATCH] [GPU] Applied comments

Signed-off-by: Min, Byung-il
---
 .../fully_connected_gpu_bf_tiled.cl           | 22 +++++++++----------
 .../fully_connected_kernel_bf_tiled.cpp       | 22 +++++++++----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
index 7e2e788daca4e5..315f83ffedbd79 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -58,9 +58,9 @@ KERNEL(quantize_input)(
     }
 
     // Pair of quantizing_scale and quantized activation_sum for each group
-    quan_var[offset * 2] = CAT(CAT(convert_, INPUT0_TYPE), _rte)(quan_scale);
+    quan_var[offset * 2] = convert_half(quan_scale);
 #if COMPRESSED_WEIGHTS_INT8
-    quan_var[(offset * 2) + 1] = CAT(CAT(convert_, INPUT0_TYPE), _rte)(quantized_sum);
+    quan_var[(offset * 2) + 1] = convert_half(quantized_sum);
 #endif
 }
 #else // !FC_KERNEL_DYNAMIC_QUANTIZE
@@ -1014,25 +1014,25 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
             // Next batch
             in_offset += (TILE_IN_B_PITCH * 2);
 
-            #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && (NUM_LOOP_IN_DYN_QUAN_GROUP == 1)
-                de_quantize_scale[bi * 2] = TO_INPUT0_TYPE(quan_var[scale_offset * 2]);
-                de_quantize_scale[bi * 2 + 1] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + scale_pitch * 2]);
+            #if !PER_TOKEN_SIZE_DYN_QUANTIZE && (NUM_LOOP_IN_DYN_QUAN_GROUP == 1)
+                de_quantize_scale[bi * 2] = quan_var[scale_offset * 2];
+                de_quantize_scale[bi * 2 + 1] = quan_var[scale_offset * 2 + scale_pitch * 2];
                 #if COMPRESSED_WEIGHTS_INT8
                     // Need additional accumulation of quantized activation along the dyn-quan group
                     // to use i8 multiplier for int8 weight
-                    activation_sum[bi * 2] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1]);
-                    activation_sum[bi * 2 + 1] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1 + scale_pitch * 2]);
+                    activation_sum[bi * 2] = quan_var[scale_offset * 2 + 1];
+                    activation_sum[bi * 2 + 1] = quan_var[scale_offset * 2 + 1 + scale_pitch * 2];
                 #endif
                 scale_offset += (scale_pitch * 2);
             #endif
         }
 
-        #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && (NUM_LOOP_IN_DYN_QUAN_GROUP > 1)
+        #if !PER_TOKEN_SIZE_DYN_QUANTIZE && (NUM_LOOP_IN_DYN_QUAN_GROUP > 1)
         if (ni % NUM_LOOP_IN_DYN_QUAN_GROUP == 0) {
             unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
-                de_quantize_scale[bi] = TO_INPUT0_TYPE(quan_var[scale_offset * 2]);
+                de_quantize_scale[bi] = quan_var[scale_offset * 2];
                 #if COMPRESSED_WEIGHTS_INT8
-                    activation_sum[bi] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1]);
+                    activation_sum[bi] = quan_var[scale_offset * 2 + 1];
                 #endif
                 scale_offset += scale_pitch;
             }
@@ -1217,7 +1217,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
             #endif
         }  // Whole tile_k elements of each iteration : ki
 
-        #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
+        #if !PER_TOKEN_SIZE_DYN_QUANTIZE && DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
             // Dynamic-quantizing group size set to same or smaller than scale group size
             if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
                 const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
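Note: the kernel changes above assume the following quan_var layout; this is a minimal OpenCL sketch for illustration only (kernel name and arguments are hypothetical, not part of the patch). Each dynamic-quantization group stores a (scale, activation_sum) pair of half values, so convert_half() is sufficient on the store side and the reads no longer need a TO_INPUT0_TYPE cast when the activation type is already half.

    #pragma OPENCL EXTENSION cl_khr_fp16 : enable

    // Hypothetical stand-alone kernel showing the assumed per-group layout of quan_var:
    // quan_var[group * 2]     -> de-quantization scale (half)
    // quan_var[group * 2 + 1] -> sum of quantized activations in the group (half)
    __kernel void store_quan_pair(__global half* quan_var,
                                  float quan_scale,
                                  float quantized_sum,
                                  uint group) {
        quan_var[group * 2]     = convert_half(quan_scale);
        quan_var[group * 2 + 1] = convert_half(quantized_sum);
    }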
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index c5d2373cb97d0f..51dfac1b2b4fee 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -57,10 +57,10 @@ static size_t get_scale_group_size(const fully_connected_params& params) {
     return params.weights.IFM().v / params.decompression_scale.Feature().v;
 }
 
-static bool is_dyn_quan_8bit_asym(const fully_connected_params& params) {
+static bool is_8bit_asym_wei(const fully_connected_params& params) {
     auto weight_type = params.weights.GetDType();
     // UINT8 weight type is supported by FC dyn-quantize(with SLM).
-    if (weight_type == WeightsType::UINT8)
+    if (weight_type == WeightsType::UINT8 && params.has_decompression_zp)
         return true;
 
     return false;
@@ -70,7 +70,8 @@ static bool is_weight_dyn_quantizable(const fully_connected_params& params) {
     auto weight_type = params.weights.GetDType();
     if (weight_type == WeightsType::INT4 || weight_type == WeightsType::UINT4)
         return true;
-    if (is_dyn_quan_8bit_asym(params))
+    // No validated case of sym 8bit weight
+    if (is_8bit_asym_wei(params))
         return true;
 
     return false;
@@ -121,7 +122,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
             dynamic_quantization_group_size = scale_group_size;
 
             // For int8 ASYM model, activation_sum should fit to weight zp
-            if (is_dyn_quan_8bit_asym(params) && params.has_decompression_zp == true &&
+            if (is_8bit_asym_wei(params) && params.has_decompression_zp == true &&
                 dynamic_quantization_group_size > zp_group_size && (zp_group_size % input_load_size) == 0) {
                 dynamic_quantization_group_size = zp_group_size;
             }
@@ -753,10 +754,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
             jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
 
-        if (is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second)
-            jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE", 1));
-        else
-            jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE", 0));
+        jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE",
+                                        is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second));
     } else {
         if (add_decompress_scale_post_op)
             jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
@@ -897,7 +896,8 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
             size_t input_f = get_input_bf_size(prim_params).second;
             size_t input_size = input_f * dispatchData.tile_m * dispatchData.gws[2];
             OPENVINO_ASSERT(quantize_grp_size != 0, "Error: quantize_grp_size is zero.");
-            size_t quan_var_size = (input_size / quantize_grp_size) * sizeof(float) * 2;
+            // half type of de_quan_scale and activation sum for each quantized group
+            size_t quan_var_size = (input_size / quantize_grp_size) * 2 * 2;
 
             if (kd.internalBufferSizes[0] < input_size ||
                 kd.internalBufferSizes[1] < quan_var_size) {
@@ -1117,8 +1117,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
         // char type quantized input
         kd.internalBufferSizes.push_back(input_size);
-        // float type of de_quan_scale and activation sum for each quantized group
-        kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * sizeof(float) * 2);
+        // half type of de_quan_scale and activation sum for each quantized group
+        kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * 2 * 2);
         kernel_number++;
     }
     kd.internalBufferDataType = Datatype::F16;
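Note: a worked example of the new internal-buffer sizing; the helper below is illustrative (hypothetical name, not part of the kernel selector). quan_var keeps one (de_quan_scale, activation_sum) pair per quantized group, and with the switch from float to half each pair now takes 2 elements * 2 bytes = 4 bytes, consistent with kd.internalBufferDataType = Datatype::F16.

    #include <stddef.h>

    /* Illustrative helper mirroring the sizing used in the patch:
       bytes = groups * 2 elements (scale + activation_sum) * 2 bytes (fp16).
       The previous code reserved groups * 2 * sizeof(float) = 8 bytes per group. */
    size_t quan_var_buffer_bytes(size_t input_size, size_t quantize_grp_size) {
        size_t num_groups = input_size / quantize_grp_size;
        return num_groups * 2 * 2;
    }

    /* Example: input_size = 4096, quantize_grp_size = 128 -> 32 groups -> 128 bytes,
       versus 256 bytes with the old float layout. */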