
Commit 75e1dda
[GPU] Applied comments
Signed-off-by: Min, Byung-il <[email protected]>
byungilm committed Dec 23, 2024
1 parent a21b39e commit 75e1dda
Showing 2 changed files with 22 additions and 22 deletions.
@@ -58,9 +58,9 @@ KERNEL(quantize_input)(
}

// Pair of quantizing_scale and quantized activation_sum for each group
- quan_var[offset * 2] = CAT(CAT(convert_, INPUT0_TYPE), _rte)(quan_scale);
+ quan_var[offset * 2] = convert_half(quan_scale);
#if COMPRESSED_WEIGHTS_INT8
- quan_var[(offset * 2) + 1] = CAT(CAT(convert_, INPUT0_TYPE), _rte)(quantized_sum);
+ quan_var[(offset * 2) + 1] = convert_half(quantized_sum);
#endif
}
#else // !FC_KERNEL_DYNAMIC_QUANTIZE
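
The stores above switch quan_var from the CAT-generated INPUT0_TYPE round-to-even conversion to a plain convert_half, so the buffer holds fp16 regardless of the input type. A minimal host-side sketch of the resulting layout (hypothetical names, fp16 shown as a raw 16-bit stand-in, not code from the commit): each quantization group owns two consecutive entries, the de-quantization scale at the even index and the quantized activation sum at the odd index.

    #include <cstdint>
    #include <vector>

    using half_bits = uint16_t; // stand-in for fp16 storage; 2 bytes per value

    struct QuanVarView {
        // Layout written by the kernel: [scale_g0, sum_g0, scale_g1, sum_g1, ...]
        std::vector<half_bits> data;
        half_bits scale(size_t group) const          { return data[group * 2]; }
        half_bits activation_sum(size_t group) const { return data[group * 2 + 1]; }
    };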
@@ -1014,25 +1014,25 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
// Next batch
in_offset += (TILE_IN_B_PITCH * 2);

- #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && (NUM_LOOP_IN_DYN_QUAN_GROUP == 1)
- de_quantize_scale[bi * 2] = TO_INPUT0_TYPE(quan_var[scale_offset * 2]);
- de_quantize_scale[bi * 2 + 1] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + scale_pitch * 2]);
+ #if !PER_TOKEN_SIZE_DYN_QUANTIZE && (NUM_LOOP_IN_DYN_QUAN_GROUP == 1)
+ de_quantize_scale[bi * 2] = quan_var[scale_offset * 2];
+ de_quantize_scale[bi * 2 + 1] = quan_var[scale_offset * 2 + scale_pitch * 2];
#if COMPRESSED_WEIGHTS_INT8
// Need additional accumulation of quantized activation along the dyn-quan group
// to use i8 multiplier for int8 weight
- activation_sum[bi * 2] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1]);
- activation_sum[bi * 2 + 1] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1 + scale_pitch * 2]);
+ activation_sum[bi * 2] = quan_var[scale_offset * 2 + 1];
+ activation_sum[bi * 2 + 1] = quan_var[scale_offset * 2 + 1 + scale_pitch * 2];
#endif
scale_offset += (scale_pitch * 2);
#endif
}

- #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && (NUM_LOOP_IN_DYN_QUAN_GROUP > 1)
+ #if !PER_TOKEN_SIZE_DYN_QUANTIZE && (NUM_LOOP_IN_DYN_QUAN_GROUP > 1)
if (ni % NUM_LOOP_IN_DYN_QUAN_GROUP == 0) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
- de_quantize_scale[bi] = TO_INPUT0_TYPE(quan_var[scale_offset * 2]);
+ de_quantize_scale[bi] = quan_var[scale_offset * 2];
#if COMPRESSED_WEIGHTS_INT8
- activation_sum[bi] = TO_INPUT0_TYPE(quan_var[scale_offset * 2 + 1]);
+ activation_sum[bi] = quan_var[scale_offset * 2 + 1];
#endif
scale_offset += scale_pitch;
}
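
The dropped TO_INPUT0_TYPE casts follow from the first hunk: quan_var is now written as half, so its values can be loaded directly. The activation_sum itself exists because of the usual asymmetric dequantization algebra; a sketch under the assumed convention w = scale * (qw - zp), which is an illustration rather than the kernel's exact code:

    #include <cstdint>

    // With asymmetric int8 weights, dot(a, w) over one group expands to
    //   scale * (idot(qa, qw) - zp * sum(qa)),
    // so precomputing sum(qa) keeps the inner loop in i8*i8 -> i32 arithmetic.
    // The activation de-quantization scale is applied separately and omitted here.
    float group_dot(const int8_t* qa, const int8_t* qw, int n,
                    float w_scale, int32_t w_zp, int32_t activation_sum) {
        int32_t acc = 0;
        for (int i = 0; i < n; ++i)
            acc += int32_t(qa[i]) * int32_t(qw[i]); // i8 multiplier, i32 accumulate
        return w_scale * float(acc - w_zp * activation_sum);
    }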
@@ -1217,7 +1217,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
#endif
} // Whole tile_k elements of each iteration : ki

- #if (PER_TOKEN_SIZE_DYN_QUANTIZE == 0) && DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
+ #if !PER_TOKEN_SIZE_DYN_QUANTIZE && DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
// Dynamic-quantizing group size set to same or smaller than scale group size
if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
@@ -57,10 +57,10 @@ static size_t get_scale_group_size(const fully_connected_params& params) {
return params.weights.IFM().v / params.decompression_scale.Feature().v;
}

- static bool is_dyn_quan_8bit_asym(const fully_connected_params& params) {
+ static bool is_8bit_asym_wei(const fully_connected_params& params) {
auto weight_type = params.weights.GetDType();
// UINT8 weight type is supported by FC dyn-quantize(with SLM).
- if (weight_type == WeightsType::UINT8)
+ if (weight_type == WeightsType::UINT8 && params.has_decompression_zp)
return true;

return false;
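
The rename also tightens the predicate: UINT8 weights now qualify only when a decompression zero-point is present. An illustrative stub (not the real kernel_selector types) showing the resulting behavior:

    struct ParamsStub { bool uint8_weights; bool has_decompression_zp; };

    static bool is_8bit_asym_wei_stub(const ParamsStub& p) {
        return p.uint8_weights && p.has_decompression_zp;
    }
    // {true, false} -> false : sym 8-bit weights, no validated case
    // {true, true } -> true  : asym 8-bit weights, dyn-quan supported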
@@ -70,7 +70,8 @@ static bool is_weight_dyn_quantizable(const fully_connected_params& params) {
auto weight_type = params.weights.GetDType();
if (weight_type == WeightsType::INT4 || weight_type == WeightsType::UINT4)
return true;
- if (is_dyn_quan_8bit_asym(params))
+ // No validated case of sym 8bit weight
+ if (is_8bit_asym_wei(params))
return true;

return false;
@@ -121,7 +122,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
dynamic_quantization_group_size = scale_group_size;

// For int8 ASYM model, activation_sum should fit to weight zp
- if (is_dyn_quan_8bit_asym(params) && params.has_decompression_zp == true &&
+ if (is_8bit_asym_wei(params) && params.has_decompression_zp == true &&
dynamic_quantization_group_size > zp_group_size && (zp_group_size % input_load_size) == 0) {
dynamic_quantization_group_size = zp_group_size;
}
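
A worked example of the clamp above, with illustrative numbers not taken from the commit: starting from a 256-element group inherited from the scale, a 128-element zero-point group that input_load_size divides evenly forces the dyn-quan group down, so each activation_sum spans exactly one weight zero-point group.

    size_t dyn_quan_group = 256;            // initially follows scale_group_size
    const size_t zp_group_size   = 128;     // weight zero-point group (assumed)
    const size_t input_load_size = 4;       // elements loaded per read (assumed)
    if (dyn_quan_group > zp_group_size && zp_group_size % input_load_size == 0)
        dyn_quan_group = zp_group_size;     // clamped to 128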
@@ -753,10 +754,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));

- if (is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second)
-     jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE", 1));
- else
-     jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE", 0));
+ jit.AddConstant(MakeJitConstant("PER_TOKEN_SIZE_DYN_QUANTIZE",
+                 is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second));
} else {
if (add_decompress_scale_post_op)
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
@@ -897,7 +896,8 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
size_t input_f = get_input_bf_size(prim_params).second;
size_t input_size = input_f * dispatchData.tile_m * dispatchData.gws[2];
OPENVINO_ASSERT(quantize_grp_size != 0, "Error: quantize_grp_size is zero.");
- size_t quan_var_size = (input_size / quantize_grp_size) * sizeof(float) * 2;
+ // half type of de_quan_scale and activation sum for each quantized group
+ size_t quan_var_size = (input_size / quantize_grp_size) * 2 * 2;

if (kd.internalBufferSizes[0] < input_size ||
kd.internalBufferSizes[1] < quan_var_size) {
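
The new literal spells out the shrunken element size: two fp16 values per quantization group instead of two floats. Written out with named factors (equivalent arithmetic; the names are illustrative, input_size and quantize_grp_size come from the surrounding code):

    const size_t num_groups       = input_size / quantize_grp_size;
    const size_t values_per_group = 2; // de-quant scale + activation sum
    const size_t bytes_per_half   = 2; // fp16, matching the convert_half stores
    size_t quan_var_size = num_groups * values_per_group * bytes_per_half;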
@@ -1117,8 +1117,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
// char type quantized input
kd.internalBufferSizes.push_back(input_size);
- // float type of de_quan_scale and activation sum for each quantized group
- kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * sizeof(float) * 2);
+ // half type of de_quan_scale and activation sum for each quantized group
+ kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * 2 * 2);
kernel_number++;
}
kd.internalBufferDataType = Datatype::F16;
