From 20abadab2fd088aca8c00149d1d9d22a60e3045d Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Fri, 26 Jan 2024 15:49:09 +0400
Subject: [PATCH] [GPU] Add optimization for FC for beam search and beams number < 8 (#22335)

* [GPU] Add optimization for FC for beam search and beams number < 8

* Re-enable async compilation for batch_size==1
---
 .../intel_gpu/src/graph/primitive_inst.cpp    |  19 +-
 .../fully_connected_gpu_bf_tiled.cl           | 518 +++++------------
 .../fully_connected_gpu_bf_tiled_common.cl    | 390 ++++++++++++++
 .../kernel_selector/kernel_base_opencl.cpp    |   1 +
 .../fully_connected_kernel_bf_tiled.cpp       |   8 +-
 .../src/kernel_selector/primitive_db_gen.py   |   9 +-
 6 files changed, 565 insertions(+), 380 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl

diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 0ab59a351f50ff..195124f15e7937 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -685,7 +685,24 @@ bool primitive_inst::use_async_compilation() {
         return false;
     }
 
-    return (_node->is_type<convolution>() || _node->is_type<fully_connected>() || _node->is_type<gemm>() ||
+    bool compile_fc_impls = _node->is_type<fully_connected>();
+    if (compile_fc_impls) {
+        const auto& fc_node = _node->as<fully_connected>();
+        if (fc_node.get_primitive()->compressed_weights) {
+            auto weights_dt = fc_node.weights().get_output_layout().data_type;
+            auto input_shape = _impl_params->get_input_layout().get_shape();
+            auto batch_size = std::accumulate(input_shape.begin(),
+                                              input_shape.end() - 1,
+                                              size_t{1},
+                                              std::multiplies<size_t>());
+
+            // Disable async compilation for all int4 FC, except in the case of batch_size == 1
+            if (one_of(weights_dt, {data_types::i4, data_types::u4}) && batch_size != 1)
+                compile_fc_impls = false;
+        }
+    }
+
+    return (_node->is_type<convolution>() || compile_fc_impls || _node->is_type<gemm>() ||
             (_node->is_type<softmax>() && _node->get_selected_impl() &&
              _node->get_selected_impl()->get_kernel_name().find("softmax_gpu_ref") != std::string::npos));
 }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
index 76200dc2e17812..fe23ffb9d90a32 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -89,388 +89,37 @@
 # define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
 #endif
 
-inline void FUNC(fc_bf_tiled_kernel_tile_b1)(
-    OPTIONAL_SHAPE_INFO_ARG
-    const __global INPUT0_TYPE* input,
-#if DECOMPRESSION_SCALE_TERM
-    const __global DECOMPRESSION_SCALE_TYPE* decompression_scale,
-#endif
-#if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR
-    const __global DECOMPRESSION_ZP_TYPE* decompression_zp,
-#endif
-    __global OUTPUT_TYPE* output,
-    const __global FILTER_TYPE* weights
-#if BIAS_TERM
-    , const __global BIAS_TYPE* biases
-#endif
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-) {
-    #define FORCED_TILE_B 1
-    uint gid = (uint)get_group_id(0);
-    uint sglid = (uint)get_sub_group_local_id();
-
-    // Dispatch as bs_fs_bsv_fsv, where bsv = DISPATCH_BSV and fsv = DISPATCH_FSV.
-    // This allows more fine grained control over dispatch order than using work-groups and
-    // avoids requirement of threads being available for whole work-group.
- // It could hovewer have some drawbacks like not providing physical locality or not using - // full dispatch pipeline. - uint feature_mini_block = gid % DISPATCH_FSV; - uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; - uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); - uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); - - uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD); - uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * FORCED_TILE_B); - - ACCUMULATOR_VEC_TYPE acc[FORCED_TILE_B] = { }; - INPUT_VEC_TYPE in_0[FORCED_TILE_B] = { }; - - FILTER_VEC_TYPE wei = 0; - uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; -#if COMPRESSED_WEIGHTS_INT4 - uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); -#else - uint weights_offset = out_f * INPUT_ELEMENTS_COUNT; -#endif - -#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 - #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f); - #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 - ACCUMULATOR_VEC_TYPE d_scale = 0; - unroll_for(uint of = 0; of < TILE_OFM; ++of) { - uint offset = out_f + of*SIMD + get_sub_group_local_id(); - if (offset < DECOMPRESSION_SCALE_LENGTH) - ((ACCUMULATOR_TYPE*)(&d_scale))[of] = decompression_scale[offset]; - } - #else - ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0]; - #endif - - ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale); -#endif - -#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR - #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f); - #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 - ACCUMULATOR_VEC_TYPE d_zp = 0; - unroll_for(uint of = 0; of < TILE_OFM; ++of) { - uint offset = out_f + of*SIMD + get_sub_group_local_id(); - if (offset < DECOMPRESSION_ZP_LENGTH) - ((ACCUMULATOR_TYPE*)(&d_zp))[of] = decompression_zp[offset]; - } - #else - ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0]; - #endif - ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); -#endif - -#if REALIGN_FP16_OFFSET - // For fp16 we need to ensure that all block reads are aligned to 4 byte (2 words) boundary. - // To do this solve first input feature separately. - { - INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % FORCED_TILE_B * TILE_IN_B_PITCH]; - ACCUMULATOR_VEC_TYPE tmp_wei = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset)); - #if COMPRESSED_WEIGHTS - tmp_wei = (tmp_wei - d_zp) * d_scale; - #endif - unroll_for(uint bi = 0; bi < FORCED_TILE_B; ++bi) { - acc[bi] = _sub_group_shuffle(tmp_input, bi) * tmp_wei; - } - - weights_offset += TILE_OFM * SIMD; - input_offset += 1; - } -#endif - // ===================================================================================================================================== - // Main computation loop - uint iterations = MAIN_LOOP_ELEMENTS_COUNT / (TILE_IFM * SIMD); - __attribute__((opencl_unroll_hint(1))) - for (uint ni = 0; ni < iterations; ++ni) { - // Load input. 
- #define LOAD_IN_0(bi) do { \ - in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \ - input_offset += TILE_IN_B_PITCH; \ - } while (false) - - CONST_LOOP(FORCED_TILE_B, LOAD_IN_0); - #undef LOAD_IN_0 - input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * FORCED_TILE_B; - // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, - // but significantly degrades readability and generality of code. - // It doesn't also show noticable performance improvement on tested configurations. - #if DECOMPRESSION_SCALE_POST_OP - ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; - #endif - - unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) { - #if COMPRESSED_WEIGHTS_INT4 - FILTER_PACKED_VEC_TYPE wei_packed = FILTER_BLOCK_READ(weights, weights_offset); - wei = UNPACK_INT4x2(ACCUMULATOR_TYPE, *((INT4_PACKED_TYPE*)&wei_packed)); - #else - wei = TO_FILTER_VEC_TYPE(FILTER_BLOCK_READ(weights, weights_offset)); - #endif - - #if COMPRESSED_WEIGHTS - ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei); - unroll_for(uint kii = 0; kii < TILE_K; ++kii) { - unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { - const uint w_idx = kii * TILE_OFM + fi; - const uint offset_ofm = out_f + fi*SIMD + sglid; - #if !DECOMPRESSION_SCALE_POST_OP - // Apply scales before FMA to avoid FP16 overflow in case of INT8 - #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 - const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + - ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; - ACCUMULATOR_TYPE ds = decompression_scale[scale_offset]; - #else - ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; - #endif - #else - ACCUMULATOR_TYPE ds = ACCUMULATOR_VAL_ONE; - #endif - - #if DECOMPRESSION_ZP_TERM - #if DECOMPRESSION_ZP_SCALAR - ACCUMULATOR_TYPE dzp = DECOMPRESSION_ZP_VALUE; - #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 - const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + - ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; - ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset]; - #else - ACCUMULATOR_TYPE dzp = d_zps[fi % DECOMPRESSION_ZP_LENGTH]; - #endif - #else - ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO; - #endif - w[w_idx] = (w[w_idx] - dzp) * ds; - } - } - #endif - weights_offset += TILE_K_OFM_PACKED * SIMD; - - unroll_for (uint kii = 0; kii < TILE_K; ++kii) { - const uint total_k = ki * TILE_K + kii; - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); - unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { -#if DECOMPRESSION_SCALE_POST_OP - ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[kii * TILE_OFM + fi]; -#else - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[kii * TILE_OFM + fi]; -#endif - } - } - } -#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM * SIMD > DECOMPRESSION_SCALE_GROUP_SIZE) - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { - const uint offset_ofm = out_f + fi*SIMD + sglid; - - #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 - const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + - ((ni*TILE_IFM*SIMD + ki*TILE_K) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; - ACCUMULATOR_TYPE 
ds = decompression_scale[scale_offset]; - #else - ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; - #endif - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; - acc_tmp[bi][fi] = 0; - } - } -#endif - } -#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM * SIMD <= DECOMPRESSION_SCALE_GROUP_SIZE) - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { - const uint offset_ofm = out_f + fi*SIMD + sglid; - - #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 - const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + - ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; - ACCUMULATOR_TYPE ds = decompression_scale[scale_offset]; - #else - ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; - #endif - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; - } - } -#endif - } - // ===================================================================================================================================== - // Leftovers -#if MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 - // Handle leftovers in normal case without alignment correction. - #define LEFTOVER_IFM (MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD)) - { - #define LOAD_IN_0(bi) do { \ - in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \ - input_offset += TILE_IN_B_PITCH; \ - } while (false) - - CONST_LOOP(FORCED_TILE_B, LOAD_IN_0); - #undef LOAD_IN_0 - input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * FORCED_TILE_B; - unroll_for(uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) { - #if COMPRESSED_WEIGHTS_INT4 - FILTER_PACKED_VEC_TYPE wei_packed = FILTER_BLOCK_READ(weights, weights_offset); - wei = UNPACK_INT4x2(ACCUMULATOR_TYPE, *((INT4_PACKED_TYPE*)&wei_packed)); - #else - wei = TO_FILTER_VEC_TYPE(FILTER_BLOCK_READ(weights, weights_offset)); - #endif - - #if COMPRESSED_WEIGHTS - ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei); - unroll_for(uint kii = 0; kii < TILE_K; ++kii) { - unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { - const uint w_idx = kii * TILE_OFM + fi; - uint offset_ofm = out_f + fi*SIMD + get_sub_group_local_id(); - #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 - const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + - ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; - ACCUMULATOR_TYPE ds = decompression_scale[scale_offset]; - #else - ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; - #endif +#if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4 +#pragma disable_includes_optimization +#define FORCED_TILE_B 1 +#include "include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B - #if DECOMPRESSION_ZP_TERM - #if DECOMPRESSION_ZP_SCALAR - ACCUMULATOR_TYPE dzp = DECOMPRESSION_ZP_VALUE; - #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 - const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + - ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; - ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset]; - #else - ACCUMULATOR_TYPE dzp = d_zps[fi % DECOMPRESSION_ZP_LENGTH]; - #endif - #else - ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO; - #endif - w[w_idx] = (w[w_idx] - dzp) * ds; - } - } - #endif - weights_offset += TILE_K_OFM_PACKED * SIMD; +#define FORCED_TILE_B 2 +#include 
"include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B - unroll_for (uint kii = 0; kii < TILE_K; ++kii) { - unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - const uint total_k = ki * TILE_K + kii; - if (total_k < LEFTOVER_IFM) { - INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[kii * TILE_OFM + fi]; - } - } - } - } - } - } - #undef LEFTOVER_IFM -#endif // MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 - // ===================================================================================================================================== - // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[FORCED_TILE_B] = { }; - for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); - } +#define FORCED_TILE_B 3 +#include "include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B -#if BIAS_TERM - #if TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 - BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); - #else - BIAS_VEC_TYPE bias = 0; - unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { - ((BIAS_TYPE*)(&bias))[fi] = biases[out_f + sglid + fi * SIMD]; - } - #endif - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - activated[bi] += TO_ACTIVATION_VEC_TYPE(bias); - } -#endif +#define FORCED_TILE_B 4 +#include "include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B - OUTPUT_VEC_TYPE result[FORCED_TILE_B] = { }; -#if HAS_FUSED_OPS - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - #if TILE_OFM > 1 - unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { - FUSED_OPS_VEC; - result[bi][fi] = FUSED_OPS_RESULT_VEC; - } - #else - FUSED_OPS_SCALAR; - result[bi] = FUSED_OPS_RESULT_SCALAR; - #endif // TILE_OFM > 1 - } -#else - unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - result[bi] = TO_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(activated[bi], ACTIVATION_PARAMS_TYPED)); - } -#endif - // ===================================================================================================================================== - // Write results - uint output_offset = out_f * TILE_OUT_F_PITCH + out_b * TILE_OUT_B_PITCH + OUTPUT_OFFSET; - - if (USE_BLOCK_WRITE && (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= TILE_OUT_F_NUM)) { -#if IS_DYNAMIC - #define WRITE_OUTPUT(bi) do { \ - if (bi + out_b < BATCH_SIZE) \ - OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \ - output_offset += TILE_OUT_B_PITCH; \ - } while (false) -#else - #define WRITE_OUTPUT(bi) do { \ - OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \ - output_offset += TILE_OUT_B_PITCH; \ - } while (false) -#endif - CONST_LOOP(FORCED_TILE_B, WRITE_OUTPUT); - #undef WRITE_OUTPUT - } else { - output_offset += sglid; +#define FORCED_TILE_B 5 +#include "include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B - // TODO: Investigate why below code doesn't compile and check how it affects performance. 
- //#define WRITE_OUTPUT_FEATURE(fi) do { \ - // const bool should_write = \ - // TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || \ - // out_f + (fi) * SIMD + sglid < TILE_OUT_F_NUM; \ - // if (should_write) { \ - // output[output_offset] = result[out_bi][fi]; \ - // } \ - // output_offset += SIMD; \ - // } while (false) - // - //#define WRITE_OUTPUT(bi) do { \ - // const uint out_bi = bi; \ - // CONST_LOOP(TILE_OFM, WRITE_OUTPUT_FEATURE); \ - // output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; \ - // } while (false) - // - //CONST_LOOP(FORCED_TILE_B, WRITE_OUTPUT); - //#undef WRITE_OUTPUT - //#undef WRITE_OUTPUT_FEATURE +#define FORCED_TILE_B 6 +#include "include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B - for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { - for (uint fi = 0; fi < TILE_OFM; ++fi) { - const bool should_write = -#if IS_DYNAMIC - bi + out_b < BATCH_SIZE && +#define FORCED_TILE_B 7 +#include "include/fully_connected_gpu_bf_tiled_common.cl" +#undef FORCED_TILE_B +#pragma enable_includes_optimization #endif - (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || - out_f + fi * SIMD + sglid < TILE_OUT_F_NUM); - if (should_write) { - output[output_offset] = ((OUTPUT_TYPE*)(&result[bi]))[fi]; - } - output_offset += SIMD; - } - output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; - } - } - // ===================================================================================================================================== - #undef FORCED_TILE_B -} inline void FUNC(fc_bf_tiled_kernel_default)( OPTIONAL_SHAPE_INFO_ARG @@ -1002,8 +651,123 @@ KERNEL(fc)( __local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD]; #endif #if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4 - if (BATCH_SIZE == 1) { - FUNC_CALL(fc_bf_tiled_kernel_tile_b1)( + const int batch_size = BATCH_SIZE; + if (batch_size == 1) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b1)( + OPTIONAL_SHAPE_INFO_TENSOR + input, + #if DECOMPRESSION_SCALE_TERM + decompression_scale, + #endif + #if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + decompression_zp, + #endif + output, + weights + #if BIAS_TERM + , biases + #endif + #if HAS_FUSED_OPS_DECLS + , FUSED_OPS_ARGS + #endif + ); + } else if (batch_size == 2) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b2)( + OPTIONAL_SHAPE_INFO_TENSOR + input, + #if DECOMPRESSION_SCALE_TERM + decompression_scale, + #endif + #if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + decompression_zp, + #endif + output, + weights + #if BIAS_TERM + , biases + #endif + #if HAS_FUSED_OPS_DECLS + , FUSED_OPS_ARGS + #endif + ); + } else if (batch_size == 3) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b3)( + OPTIONAL_SHAPE_INFO_TENSOR + input, + #if DECOMPRESSION_SCALE_TERM + decompression_scale, + #endif + #if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + decompression_zp, + #endif + output, + weights + #if BIAS_TERM + , biases + #endif + #if HAS_FUSED_OPS_DECLS + , FUSED_OPS_ARGS + #endif + ); + } else if (batch_size == 4) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b4)( + OPTIONAL_SHAPE_INFO_TENSOR + input, + #if DECOMPRESSION_SCALE_TERM + decompression_scale, + #endif + #if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + decompression_zp, + #endif + output, + weights + #if BIAS_TERM + , biases + #endif + #if HAS_FUSED_OPS_DECLS + , FUSED_OPS_ARGS + #endif + ); + } else if (batch_size == 5) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b5)( + OPTIONAL_SHAPE_INFO_TENSOR + input, + #if DECOMPRESSION_SCALE_TERM + decompression_scale, + #endif + #if DECOMPRESSION_ZP_TERM && 
!DECOMPRESSION_ZP_SCALAR + decompression_zp, + #endif + output, + weights + #if BIAS_TERM + , biases + #endif + #if HAS_FUSED_OPS_DECLS + , FUSED_OPS_ARGS + #endif + ); + } else if (batch_size == 6) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b6)( + OPTIONAL_SHAPE_INFO_TENSOR + input, + #if DECOMPRESSION_SCALE_TERM + decompression_scale, + #endif + #if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + decompression_zp, + #endif + output, + weights + #if BIAS_TERM + , biases + #endif + #if HAS_FUSED_OPS_DECLS + , FUSED_OPS_ARGS + #endif + ); + } else if (batch_size == 7) { + FUNC_CALL(fc_bf_tiled_kernel_forced_tile_b7)( OPTIONAL_SHAPE_INFO_TENSOR input, #if DECOMPRESSION_SCALE_TERM diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl new file mode 100644 index 00000000000000..98fe1d1082d3c8 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ -0,0 +1,390 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#define FUNC_NAME_BATCHED CAT(fc_bf_tiled_kernel_forced_tile_b, FORCED_TILE_B) +#define FUNC_NAME CAT(_, CAT(CAT(FUNC_NAME_BATCHED, _), KERNEL_ID)) + +inline void (FUNC_NAME)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, +#if DECOMPRESSION_SCALE_TERM + const __global DECOMPRESSION_SCALE_TYPE* decompression_scale, +#endif +#if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + const __global DECOMPRESSION_ZP_TYPE* decompression_zp, +#endif + __global OUTPUT_TYPE* output, + const __global FILTER_TYPE* weights +#if BIAS_TERM + , const __global BIAS_TYPE* biases +#endif +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { + uint gid = (uint)get_group_id(0); + uint sglid = (uint)get_sub_group_local_id(); + + // Dispatch as bs_fs_bsv_fsv, where bsv = DISPATCH_BSV and fsv = DISPATCH_FSV. + // This allows more fine grained control over dispatch order than using work-groups and + // avoids requirement of threads being available for whole work-group. + // It could hovewer have some drawbacks like not providing physical locality or not using + // full dispatch pipeline. 
+ uint feature_mini_block = gid % DISPATCH_FSV; + uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + + uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD); + uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * FORCED_TILE_B); + + ACCUMULATOR_VEC_TYPE acc[FORCED_TILE_B] = { }; + INPUT_VEC_TYPE in_0[FORCED_TILE_B] = { }; + + FILTER_VEC_TYPE wei = 0; + uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; +#if COMPRESSED_WEIGHTS_INT4 + uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); +#else + uint weights_offset = out_f * INPUT_ELEMENTS_COUNT; +#endif + +#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 + #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 + ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f); + #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 + ACCUMULATOR_VEC_TYPE d_scale = 0; + unroll_for(uint of = 0; of < TILE_OFM; ++of) { + uint offset = out_f + of*SIMD + get_sub_group_local_id(); + if (offset < DECOMPRESSION_SCALE_LENGTH) + ((ACCUMULATOR_TYPE*)(&d_scale))[of] = decompression_scale[offset]; + } + #else + ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0]; + #endif + + ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale); +#endif + +#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR + #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 + ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f); + #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 + ACCUMULATOR_VEC_TYPE d_zp = 0; + unroll_for(uint of = 0; of < TILE_OFM; ++of) { + uint offset = out_f + of*SIMD + get_sub_group_local_id(); + if (offset < DECOMPRESSION_ZP_LENGTH) + ((ACCUMULATOR_TYPE*)(&d_zp))[of] = decompression_zp[offset]; + } + #else + ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0]; + #endif + ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); +#endif + +#if REALIGN_FP16_OFFSET + // For fp16 we need to ensure that all block reads are aligned to 4 byte (2 words) boundary. + // To do this solve first input feature separately. + { + INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % FORCED_TILE_B * TILE_IN_B_PITCH]; + ACCUMULATOR_VEC_TYPE tmp_wei = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset)); + #if COMPRESSED_WEIGHTS + tmp_wei = (tmp_wei - d_zp) * d_scale; + #endif + unroll_for(uint bi = 0; bi < FORCED_TILE_B; ++bi) { + acc[bi] = _sub_group_shuffle(tmp_input, bi) * tmp_wei; + } + + weights_offset += TILE_OFM * SIMD; + input_offset += 1; + } +#endif + // ===================================================================================================================================== + // Main computation loop + uint iterations = MAIN_LOOP_ELEMENTS_COUNT / (TILE_IFM * SIMD); + __attribute__((opencl_unroll_hint(1))) + for (uint ni = 0; ni < iterations; ++ni) { + // Load input. 
+ #define LOAD_IN_0(bi) do { \ + in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \ + input_offset += TILE_IN_B_PITCH; \ + } while (false) + + CONST_LOOP(FORCED_TILE_B, LOAD_IN_0); + #undef LOAD_IN_0 + input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * FORCED_TILE_B; + // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, + // but significantly degrades readability and generality of code. + // It doesn't also show noticable performance improvement on tested configurations. + #if DECOMPRESSION_SCALE_POST_OP + ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; + #endif + + unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) { + #if COMPRESSED_WEIGHTS_INT4 + FILTER_PACKED_VEC_TYPE wei_packed = FILTER_BLOCK_READ(weights, weights_offset); + wei = UNPACK_INT4x2(ACCUMULATOR_TYPE, *((INT4_PACKED_TYPE*)&wei_packed)); + #else + wei = TO_FILTER_VEC_TYPE(FILTER_BLOCK_READ(weights, weights_offset)); + #endif + + #if COMPRESSED_WEIGHTS + ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei); + unroll_for(uint kii = 0; kii < TILE_K; ++kii) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + const uint w_idx = kii * TILE_OFM + fi; + const uint offset_ofm = out_f + fi*SIMD + sglid; + #if !DECOMPRESSION_SCALE_POST_OP + // Apply scales before FMA to avoid FP16 overflow in case of INT8 + #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 + const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + + ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; + ACCUMULATOR_TYPE ds = decompression_scale[scale_offset]; + #else + ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; + #endif + #else + ACCUMULATOR_TYPE ds = ACCUMULATOR_VAL_ONE; + #endif + + #if DECOMPRESSION_ZP_TERM + #if DECOMPRESSION_ZP_SCALAR + ACCUMULATOR_TYPE dzp = DECOMPRESSION_ZP_VALUE; + #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 + const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + + ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; + ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset]; + #else + ACCUMULATOR_TYPE dzp = d_zps[fi % DECOMPRESSION_ZP_LENGTH]; + #endif + #else + ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO; + #endif + w[w_idx] = (w[w_idx] - dzp) * ds; + } + } + #endif + weights_offset += TILE_K_OFM_PACKED * SIMD; + + unroll_for (uint kii = 0; kii < TILE_K; ++kii) { + const uint total_k = ki * TILE_K + kii; + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); + unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { +#if DECOMPRESSION_SCALE_POST_OP + ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[kii * TILE_OFM + fi]; +#else + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[kii * TILE_OFM + fi]; +#endif + } + } + } +#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM * SIMD > DECOMPRESSION_SCALE_GROUP_SIZE) + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + const uint offset_ofm = out_f + fi*SIMD + sglid; + + #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 + const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + + ((ni*TILE_IFM*SIMD + ki*TILE_K) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; + ACCUMULATOR_TYPE 
ds = decompression_scale[scale_offset]; + #else + ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; + #endif + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; + } + } +#endif + } +#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM * SIMD <= DECOMPRESSION_SCALE_GROUP_SIZE) + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + const uint offset_ofm = out_f + fi*SIMD + sglid; + + #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 + const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + + ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; + ACCUMULATOR_TYPE ds = decompression_scale[scale_offset]; + #else + ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; + #endif + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + } + } +#endif + } + // ===================================================================================================================================== + // Leftovers +#if MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 + // Handle leftovers in normal case without alignment correction. + #define LEFTOVER_IFM (MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD)) + { + #define LOAD_IN_0(bi) do { \ + in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \ + input_offset += TILE_IN_B_PITCH; \ + } while (false) + + CONST_LOOP(FORCED_TILE_B, LOAD_IN_0); + #undef LOAD_IN_0 + input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * FORCED_TILE_B; + unroll_for(uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) { + #if COMPRESSED_WEIGHTS_INT4 + FILTER_PACKED_VEC_TYPE wei_packed = FILTER_BLOCK_READ(weights, weights_offset); + wei = UNPACK_INT4x2(ACCUMULATOR_TYPE, *((INT4_PACKED_TYPE*)&wei_packed)); + #else + wei = TO_FILTER_VEC_TYPE(FILTER_BLOCK_READ(weights, weights_offset)); + #endif + + #if COMPRESSED_WEIGHTS + ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei); + unroll_for(uint kii = 0; kii < TILE_K; ++kii) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + const uint w_idx = kii * TILE_OFM + fi; + uint offset_ofm = out_f + fi*SIMD + get_sub_group_local_id(); + #if DECOMPRESSION_SCALE_GROUPS_NUM > 1 + const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH + + ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH; + ACCUMULATOR_TYPE ds = decompression_scale[scale_offset]; + #else + ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; + #endif + + #if DECOMPRESSION_ZP_TERM + #if DECOMPRESSION_ZP_SCALAR + ACCUMULATOR_TYPE dzp = DECOMPRESSION_ZP_VALUE; + #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 + const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + + ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; + ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset]; + #else + ACCUMULATOR_TYPE dzp = d_zps[fi % DECOMPRESSION_ZP_LENGTH]; + #endif + #else + ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO; + #endif + w[w_idx] = (w[w_idx] - dzp) * ds; + } + } + #endif + weights_offset += TILE_K_OFM_PACKED * SIMD; + + unroll_for (uint kii = 0; kii < TILE_K; ++kii) { + unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + const uint total_k = ki * TILE_K + kii; + if (total_k < LEFTOVER_IFM) { + INPUT0_TYPE 
in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[kii * TILE_OFM + fi]; + } + } + } + } + } + } + #undef LEFTOVER_IFM +#endif // MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 + // ===================================================================================================================================== + // Post-processing: bias, activation, fused-ops + ACTIVATION_VEC_TYPE activated[FORCED_TILE_B] = { }; + for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + +#if BIAS_TERM + #if TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 + BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); + #else + BIAS_VEC_TYPE bias = 0; + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + ((BIAS_TYPE*)(&bias))[fi] = biases[out_f + sglid + fi * SIMD]; + } + #endif + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + activated[bi] += TO_ACTIVATION_VEC_TYPE(bias); + } +#endif + + OUTPUT_VEC_TYPE result[FORCED_TILE_B] = { }; +#if HAS_FUSED_OPS + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + #if TILE_OFM > 1 + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + FUSED_OPS_VEC; + result[bi][fi] = FUSED_OPS_RESULT_VEC; + } + #else + FUSED_OPS_SCALAR; + result[bi] = FUSED_OPS_RESULT_SCALAR; + #endif // TILE_OFM > 1 + } +#else + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + result[bi] = TO_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(activated[bi], ACTIVATION_PARAMS_TYPED)); + } +#endif + // ===================================================================================================================================== + // Write results + uint output_offset = out_f * TILE_OUT_F_PITCH + out_b * TILE_OUT_B_PITCH + OUTPUT_OFFSET; + + if (USE_BLOCK_WRITE && (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= TILE_OUT_F_NUM)) { +#if IS_DYNAMIC + #define WRITE_OUTPUT(bi) do { \ + if (bi + out_b < BATCH_SIZE) \ + OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \ + output_offset += TILE_OUT_B_PITCH; \ + } while (false) +#else + #define WRITE_OUTPUT(bi) do { \ + OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \ + output_offset += TILE_OUT_B_PITCH; \ + } while (false) +#endif + CONST_LOOP(FORCED_TILE_B, WRITE_OUTPUT); + #undef WRITE_OUTPUT + } else { + output_offset += sglid; + + // TODO: Investigate why below code doesn't compile and check how it affects performance. 
+ //#define WRITE_OUTPUT_FEATURE(fi) do { \ + // const bool should_write = \ + // TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || \ + // out_f + (fi) * SIMD + sglid < TILE_OUT_F_NUM; \ + // if (should_write) { \ + // output[output_offset] = result[out_bi][fi]; \ + // } \ + // output_offset += SIMD; \ + // } while (false) + // + //#define WRITE_OUTPUT(bi) do { \ + // const uint out_bi = bi; \ + // CONST_LOOP(TILE_OFM, WRITE_OUTPUT_FEATURE); \ + // output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; \ + // } while (false) + // + //CONST_LOOP(FORCED_TILE_B, WRITE_OUTPUT); + //#undef WRITE_OUTPUT + //#undef WRITE_OUTPUT_FEATURE + + for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + for (uint fi = 0; fi < TILE_OFM; ++fi) { + const bool should_write = +#if IS_DYNAMIC + bi + out_b < BATCH_SIZE && +#endif + (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || + out_f + fi * SIMD + sglid < TILE_OUT_F_NUM); + if (should_write) { + output[output_offset] = ((OUTPUT_TYPE*)(&result[bi]))[fi]; + } + output_offset += SIMD; + } + output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; + } + } + // ===================================================================================================================================== +} + +#undef FUNC_NAME_BATCHED +#undef FUNC_NAME diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp index b382561afdac34..0a5655a4996cff 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp @@ -96,6 +96,7 @@ std::pair KernelBaseOpenCL::CreateJit(const std::strin .add_line("// Kernel template: " + template_name + " ") .add_line("// Kernel name: " + kernel_id) .value_macro("KERNEL(name)", "__kernel void " + kernel_id) + .value_macro("KERNEL_ID", kernel_id) .decoration_macro("FUNC", "", kernel_id) .decoration_macro("FUNC_CALL", "", kernel_id) .decoration_macro("CONST_ARRAY_DECL", "__constant size_t ", kernel_id + " []") diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 926dcf2f65a1a5..841da3779c3289 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -437,7 +437,13 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DISPATCH_BSV", dispatchData.tile_ms)); jit.AddConstant(MakeJitConstant("DISPATCH_FSV", dispatchData.tile_ns)); - jit.Merge(MakeConstantLoopUnrollJitConstants(dispatchData.tile_m)); + auto max_tile_b_size = dispatchData.tile_m; + if (params.compressed && + params.is_shape_agnostic && + (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4)) + max_tile_b_size = std::max(max_tile_b_size, (uint32_t)8); + + jit.Merge(MakeConstantLoopUnrollJitConstants(max_tile_b_size)); bool realign_fp16_offset = params.inputs[0].GetDType() == Datatype::F16 && params.inputs[0].GetFirstElementOffset() % 2 != 0; jit.AddConstant(MakeJitConstant("REALIGN_FP16_OFFSET", realign_fp16_offset)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py index 6fc86c8c912b76..eff4c3d0033034 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
+++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py
@@ -168,13 +168,20 @@ def append_file_content(self, filename, origin_file):
         with open(filename) as f:
             content += f.readlines()
+        optimize_includes = True
         for line in content:
+            if line.startswith('#pragma'):
+                if "enable_includes_optimization" in line:
+                    optimize_includes = True
+                elif "disable_includes_optimization" in line:
+                    optimize_includes = False
+
             if line.startswith('#include'):
                 include_file_name = line.strip().split('"')[1].strip()
                 if ntpath.basename(include_file_name) in self.batch_headers:
                     continue
 
                 full_path_include = os.path.abspath(os.path.join(os.path.dirname(filename), include_file_name))
-                if full_path_include not in self.include_files[origin_file]:
+                if full_path_include not in self.include_files[origin_file] or not optimize_includes:
                     self.include_files[origin_file][full_path_include] = True
                     res += self.append_file_content(full_path_include, origin_file)
                     res += "\n"
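
Implementation note: the new gate in primitive_inst::use_async_compilation() folds every input dimension except the innermost one into a single batch size, so a beam-search request shaped {1, beams, hidden} yields batch_size == beams. The standalone C++ sketch below mirrors that logic; Shape, is_int4_weights, and allow_async_fc_compilation are illustrative names used only for this note, not the plugin's API.

// Sketch of the batch-size gating idea from use_async_compilation(); simplified stand-in types.
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

using Shape = std::vector<std::size_t>;

// Collapse all dimensions except the innermost (feature) one into a single batch size,
// e.g. {1, 4, 4096} -> 1 * 4 = 4 for a beam-search request with 4 beams.
std::size_t flattened_batch(const Shape& input_shape) {
    if (input_shape.empty())
        return 1;
    return std::accumulate(input_shape.begin(),
                           input_shape.end() - 1,
                           std::size_t{1},
                           std::multiplies<std::size_t>());
}

// Async compilation of the shape-agnostic FC implementation stays enabled only for
// batch_size == 1 when the weights are compressed int4; batches 2..7 are expected to
// be served by the pre-built forced-tile kernels instead.
bool allow_async_fc_compilation(bool compressed_weights, bool is_int4_weights, const Shape& input_shape) {
    if (!compressed_weights || !is_int4_weights)
        return true;
    return flattened_batch(input_shape) == 1;
}

For example, allow_async_fc_compilation(true, true, {1, 4, 4096}) returns false, which routes the 4-beam case away from asynchronous compilation and onto the forced-tile kernels introduced in fully_connected_gpu_bf_tiled.cl.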
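Implementation note: fully_connected_gpu_bf_tiled_common.cl is included seven times with FORCED_TILE_B set to 1..7, and the new KERNEL_ID macro from kernel_base_opencl.cpp is folded into FUNC_NAME so each inclusion produces a uniquely named helper; the #pragma disable_includes_optimization / enable_includes_optimization markers around those includes, together with the new handling in primitive_db_gen.py, keep the primitive DB generator from deduplicating the repeated include. The C++ sketch below illustrates the same "one body, specialized per tile size, dispatched at run time" pattern with a template parameter standing in for FORCED_TILE_B; the function names and the trivial body are illustrative only, not the kernel's actual math.

// Sketch: compile the common body once per compile-time tile size, then branch on the
// runtime batch size -- analogous to the repeated #include + BATCH_SIZE dispatch in KERNEL(fc).
#include <cstddef>

// Common body; TileB is a compile-time constant, so the per-batch loop can be fully
// unrolled, much like CONST_LOOP(FORCED_TILE_B, ...) in the OpenCL kernel.
template <int TileB>
void fc_tile_body(const float* input, float* output, std::size_t ifm) {
    for (int b = 0; b < TileB; ++b) {            // trip count known at compile time
        float acc = 0.0f;
        for (std::size_t k = 0; k < ifm; ++k)    // stand-in for the real dot-product/decompression work
            acc += input[b * ifm + k];
        output[b] = acc;
    }
}

// Runtime dispatch, mirroring the if/else chain on batch_size added to the kernel entry point.
void fc_dispatch(int batch_size, const float* input, float* output, std::size_t ifm) {
    switch (batch_size) {
        case 1: fc_tile_body<1>(input, output, ifm); break;
        case 2: fc_tile_body<2>(input, output, ifm); break;
        case 3: fc_tile_body<3>(input, output, ifm); break;
        case 4: fc_tile_body<4>(input, output, ifm); break;
        case 5: fc_tile_body<5>(input, output, ifm); break;
        case 6: fc_tile_body<6>(input, output, ifm); break;
        case 7: fc_tile_body<7>(input, output, ifm); break;
        default: break;                          // larger batches keep using the default tiled path
    }
}

Because each specialization sees its tile size as a constant, the loop-unroll constants requested via MakeConstantLoopUnrollJitConstants (now generated for up to 8 iterations for compressed int4, shape-agnostic FC) cover every forced-tile variant, while batch sizes of 8 and above continue to take the pre-existing fc_bf_tiled_kernel_default path.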