From 75e1ddae5afcc4bea078aa51f4d7b5416a05320a Mon Sep 17 00:00:00 2001
From: "Min, Byung-il"
Date: Mon, 23 Dec 2024 20:52:44 +0900
Subject: [PATCH] [GPU] Applied comments

Signed-off-by: Min, Byung-il
---
 .../fully_connected_gpu_bf_tiled.cl           | 22 +++++++++----------
 .../fully_connected_kernel_bf_tiled.cpp       | 22 +++++++++----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
index 7e2e788daca4e5..315f83ffedbd79 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -58,9 +58,9 @@ KERNEL(quantize_input)(
     }
 
     // Pair of quantizing_scale and quantized activation_sum for each group
-    quan_var[offset * 2] = CAT(CAT(convert_, INPUT0_TYPE), _rte)(quan_scale);
+    quan_var[offset * 2] = convert_half(quan_scale);
 #if COMPRESSED_WEIGHTS_INT8
-    quan_var[(offset * 2) + 1] = CAT(CAT(convert_, INPUT0_TYPE), _rte)(quantized_sum);
+    quan_var[(offset * 2) + 1] = convert_half(quantized_sum);
 #endif
 }
 #else // !FC_KERNEL_DYNAMIC_QUANTIZE
@@ -1014,25 +1014,25 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
             // Next batch
             in_offset += (TILE_IN_B_PITCH * 2);
 
-            #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && (NUM_LOOP_IN_DYN_QUAN_GROUP == 1)
-                de_quantize_scale[bi * 2] = TO_INPUT0_TYPE(quan_var[scale_offset * 2]);
-                de_quantize_scale[bi * 2 + 1] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + scale_pitch * 2]);
+            #if !PER_TOKEN_SIZE_DYN_QUANTIZE && (NUM_LOOP_IN_DYN_QUAN_GROUP == 1)
+                de_quantize_scale[bi * 2] = quan_var[scale_offset * 2];
+                de_quantize_scale[bi * 2 + 1] = quan_var[scale_offset * 2 + scale_pitch * 2];
                 #if COMPRESSED_WEIGHTS_INT8
                     // Need additional accumulation of quantized activation along the dyn-quan group
                     // to use i8 multiplier for int8 weight
-                    activation_sum[bi * 2] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1]);
-                    activation_sum[bi * 2 + 1] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1 + scale_pitch * 2]);
+                    activation_sum[bi * 2] = quan_var[scale_offset * 2 + 1];
+                    activation_sum[bi * 2 + 1] = quan_var[scale_offset * 2 + 1 + scale_pitch * 2];
                 #endif
                 scale_offset += (scale_pitch * 2);
             #endif
         }
 
-        #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && (NUM_LOOP_IN_DYN_QUAN_GROUP > 1)
+        #if !PER_TOKEN_SIZE_DYN_QUANTIZE && (NUM_LOOP_IN_DYN_QUAN_GROUP > 1)
         if (ni % NUM_LOOP_IN_DYN_QUAN_GROUP == 0) {
             unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
-                de_quantize_scale[bi] = TO_INPUT0_TYPE(quan_var[scale_offset * 2]);
+                de_quantize_scale[bi] = quan_var[scale_offset * 2];
                 #if COMPRESSED_WEIGHTS_INT8
-                    activation_sum[bi] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1]);
+                    activation_sum[bi] = quan_var[scale_offset * 2 + 1];
                 #endif
                 scale_offset += scale_pitch;
             }
@@ -1217,7 +1217,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
             #endif
         }  // Whole tile_k elements of each iteration : ki
 
-        #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
+        #if !PER_TOKEN_SIZE_DYN_QUANTIZE && DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
             // Dynamic-quantizing group size set to same or smaller than scale group size
             if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
                 const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
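Note: the kernel changes above assume the following quan_var layout; this is a minimal OpenCL sketch for illustration only (kernel name and arguments are hypothetical, not part of the patch). Each dynamic-quantization group stores a (scale, activation_sum) pair of half values, so convert_half() is sufficient on the store side and the reads no longer need a TO_INPUT0_TYPE cast when the activation type is already half.

    #pragma OPENCL EXTENSION cl_khr_fp16 : enable

    // Hypothetical stand-alone kernel showing the assumed per-group layout of quan_var:
    // quan_var[group * 2]     -> de-quantization scale (half)
    // quan_var[group * 2 + 1] -> sum of quantized activations in the group (half)
    __kernel void store_quan_pair(__global half* quan_var,
                                  float quan_scale,
                                  float quantized_sum,
                                  uint group) {
        quan_var[group * 2]     = convert_half(quan_scale);
        quan_var[group * 2 + 1] = convert_half(quantized_sum);
    }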
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index c5d2373cb97d0f..51dfac1b2b4fee 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -57,10 +57,10 @@ static size_t get_scale_group_size(const fully_connected_params& params) {
     return params.weights.IFM().v / params.decompression_scale.Feature().v;
 }
 
-static bool is_dyn_quan_8bit_asym(const fully_connected_params& params) {
+static bool is_8bit_asym_wei(const fully_connected_params& params) {
     auto weight_type = params.weights.GetDType();
     // UINT8 weight type is supported by FC dyn-quantize(with SLM).
-    if (weight_type == WeightsType::UINT8)
+    if (weight_type == WeightsType::UINT8 && params.has_decompression_zp)
         return true;
 
     return false;
@@ -70,7 +70,8 @@ static bool is_weight_dyn_quantizable(const fully_connected_params& params) {
     auto weight_type = params.weights.GetDType();
     if (weight_type == WeightsType::INT4 || weight_type == WeightsType::UINT4)
         return true;
-    if (is_dyn_quan_8bit_asym(params))
+    // No validated case of sym 8bit weight
+    if (is_8bit_asym_wei(params))
         return true;
 
     return false;
@@ -121,7 +122,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
             dynamic_quantization_group_size = scale_group_size;
 
             // For int8 ASYM model, activation_sum should fit to weight zp
-            if (is_dyn_quan_8bit_asym(params) && params.has_decompression_zp == true &&
+            if (is_8bit_asym_wei(params) && params.has_decompression_zp == true &&
                 dynamic_quantization_group_size > zp_group_size && (zp_group_size % input_load_size) == 0) {
                 dynamic_quantization_group_size = zp_group_size;
             }
@@ -753,10 +754,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
             jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
 
-        if (is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second)
-            jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE", 1));
-        else
-            jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE", 0));
+        jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE",
+                                        is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second));
     } else {
         if (add_decompress_scale_post_op)
             jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
@@ -897,7 +896,8 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
             size_t input_f = get_input_bf_size(prim_params).second;
             size_t input_size = input_f * dispatchData.tile_m * dispatchData.gws[2];
             OPENVINO_ASSERT(quantize_grp_size != 0, "Error: quantize_grp_size is zero.");
-            size_t quan_var_size = (input_size / quantize_grp_size) * sizeof(float) * 2;
+            // half type of de_quan_scale and activation sum for each quantized group
+            size_t quan_var_size = (input_size / quantize_grp_size) * 2 * 2;
 
             if (kd.internalBufferSizes[0] < input_size ||
                 kd.internalBufferSizes[1] < quan_var_size) {
@@ -1117,8 +1117,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
         // char type quantized input
         kd.internalBufferSizes.push_back(input_size);
-        // float type of de_quan_scale and activation sum for each quantized group
-        kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * sizeof(float) * 2);
+        // half type of de_quan_scale and activation sum for each quantized group
+        kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * 2 * 2);
         kernel_number++;
     }
     kd.internalBufferDataType = Datatype::F16;
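Note: a worked example of the new internal-buffer sizing; the helper below is illustrative (hypothetical name, not part of the kernel selector). quan_var keeps one (de_quan_scale, activation_sum) pair per quantized group, and with the switch from float to half each pair now takes 2 elements * 2 bytes = 4 bytes, consistent with kd.internalBufferDataType = Datatype::F16.

    #include <stddef.h>

    /* Illustrative helper mirroring the sizing used in the patch:
       bytes = groups * 2 elements (scale + activation_sum) * 2 bytes (fp16).
       The previous code reserved groups * 2 * sizeof(float) = 8 bytes per group. */
    size_t quan_var_buffer_bytes(size_t input_size, size_t quantize_grp_size) {
        size_t num_groups = input_size / quantize_grp_size;
        return num_groups * 2 * 2;
    }

    /* Example: input_size = 4096, quantize_grp_size = 128 -> 32 groups -> 128 bytes,
       versus 256 bytes with the old float layout. */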