From 158ab15f4bce2f8ca81c1f995d4eb24339cc9358 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Thu, 26 Dec 2024 22:26:04 -0500 Subject: [PATCH 01/20] q6_k extract scale small stuff scale cache there we go float type --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index fab4ff5ff054e..0ae155e0ee493 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -10,6 +10,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; @@ -20,13 +21,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; - const uint itid = tid%16; // 0...16 + const uint itid = tid%16; // 0...15 const uint ix = tid/16; - const uint step = 8; - - const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = itid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... + const uint v_in = itid - 8*v_im; // 0...15 or 0...7 const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28 const uint is = v_in / 4; @@ -50,28 +49,33 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; + uint ibi = first_row*num_blocks_per_row; [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + const uint ib0 = a_offset / QUANT_K + ibi; + ibi += num_blocks_per_row; - FLOAT_TYPE scales[4]; - scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); - scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); - scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); - scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); + // cache full superblock into shared memory with coalesced reads + [[unroll]] for (int l = 0; l < 4; ++l) + blkcache[ix].ql[itid + 16*l] = data_a_packed16[ib0 + i].ql[itid + 16*l]; + [[unroll]] for (int l = 0; l < 2; ++l) + blkcache[ix].qh[itid + 16*l] = data_a_packed16[ib0 + i].qh[itid + 16*l]; + blkcache[ix].scales[itid] = data_a_packed16[ib0 + i].scales[itid]; + barrier(); - uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); + const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + + uint32_t ql0_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 17]) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t 
ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh_u32 = uint32_t(blkcache[ix].qh[qh_offset / 2]) | (uint32_t(blkcache[ix].qh[qh_offset / 2 + 1]) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; - uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; + uint32_t qh4_u32 = (qh_u32 & 0x30303030); uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; @@ -84,14 +88,17 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uvec4 q2 = uvec4(unpack8(q2_u32)); uvec4 q3 = uvec4(unpack8(q3_u32)); - FLOAT_TYPE sum = FLOAT_TYPE(0.0); + FLOAT_TYPE sum[4] = {0, 0, 0, 0}; [[unroll]] for (int l = 0; l < 4; ++l) { - sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), - fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), - fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), - fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); + sum[0] = fma(FLOAT_TYPE(by0[l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]); + sum[1] = fma(FLOAT_TYPE(by32[l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]); + sum[2] = fma(FLOAT_TYPE(by64[l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]); + sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); } - temp[n] += sum * d; + + [[unroll]] for (int l = 0; l < 4; ++l) + sum[l] *= FLOAT_TYPE(blkcache[ix].scales[s_offset + l*2]); + temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; } } From 0078ae4e08549b6220f054cdb11b4d55ffcea640 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Fri, 27 Dec 2024 13:20:58 -0500 Subject: [PATCH 02/20] separate threaded read test, slower somehow --- .../vulkan-shaders/dequant_funcs.comp | 3 ++ .../vulkan-shaders/mul_mat_vec_q6_k.comp | 34 ++++++++++++------- .../src/ggml-vulkan/vulkan-shaders/types.comp | 6 ++++ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index 91bb8f8db610e..14490f1a86fee 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -7,6 +7,9 @@ #if defined(A_TYPE_PACKED16) layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; #endif +#if defined(A_TYPE_PACKED16_FLAT) +layout (binding = 0) readonly buffer A_PACKED16_FLAT {A_TYPE_PACKED16_FLAT data_a_packed16_flat[];}; +#endif #if defined(A_TYPE_PACKED32) layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];}; #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 0ae155e0ee493..9392f7e6f278c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -10,7 +10,8 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; -shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; +shared block_q6_K_packed16_flat blkcache[BLOCK_SIZE/16]; +shared FLOAT_TYPE scales[NUM_ROWS/4][16]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint 
a_offset, b_offset, d_offset; @@ -18,7 +19,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint num_blocks_per_row = p.ncols / QUANT_K; - // 16 threads are used to process each block + // 16 thread groups are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; const uint itid = tid%16; // 0...15 @@ -34,6 +35,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint qh_offset = 32*v_im + l0; const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; + const uint shift = (itid%2 == 1) ? 8 : 0; FLOAT_TYPE temp[NUM_ROWS]; @@ -41,7 +43,8 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { temp[i] = FLOAT_TYPE(0); } - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { + [[unroll]] for (uint i0 = 0; i0 < num_blocks_per_row; i0 += it_size) { + uint i = i0 + ix; // 16 thread group specific counter const uint y_idx = i * QUANT_K + y_offset; B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; @@ -55,24 +58,29 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { ibi += num_blocks_per_row; // cache full superblock into shared memory with coalesced reads - [[unroll]] for (int l = 0; l < 4; ++l) - blkcache[ix].ql[itid + 16*l] = data_a_packed16[ib0 + i].ql[itid + 16*l]; - [[unroll]] for (int l = 0; l < 2; ++l) - blkcache[ix].qh[itid + 16*l] = data_a_packed16[ib0 + i].qh[itid + 16*l]; - blkcache[ix].scales[itid] = data_a_packed16[ib0 + i].scales[itid]; + // we assume 64 threads here! + [[unroll]] for (int l = 0; (l < 4) && (i0 + l < num_blocks_per_row); ++l) { + blkcache[l].blkd[tid] = data_a_packed16_flat[ib0 + i0 + l].blkd[tid]; + // we read beyond the struct size but it looks like vulkan doesn't care? 
o_O + // it's faster than using a branch to reduce the number of threads though + blkcache[l].blkd[64 + tid] = data_a_packed16_flat[ib0 + i0 + l].blkd[64 + tid]; + } + scales[ix][itid] = FLOAT_TYPE(int8_t((blkcache[ix].blkd[96 + itid/2] >> shift) & 0xFF)); barrier(); + if (i >= num_blocks_per_row) + continue; - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + const FLOAT_TYPE d = FLOAT_TYPE(blkcache[ix][ib0 + i][104]); - uint32_t ql0_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 17]) << 16); + uint32_t ql0_u32 = uint32_t(blkcache[ix].blkd[ql_offset / 2]) | (uint32_t(blkcache[ix].blkd[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].blkd[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blkd[ql_offset / 2 + 17]) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(blkcache[ix].qh[qh_offset / 2]) | (uint32_t(blkcache[ix].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh_u32 = uint32_t(blkcache[ix].blkd[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blkd[64 + qh_offset / 2 + 1]) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); @@ -97,7 +105,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { } [[unroll]] for (int l = 0; l < 4; ++l) - sum[l] *= FLOAT_TYPE(blkcache[ix].scales[s_offset + l*2]); + sum[l] *= scales[ix][s_offset + l*2]; temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index eecc47f3a9764..a6826dd467715 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -273,10 +273,16 @@ struct block_q6_K_packed16 float16_t d; }; +struct block_q6_K_packed16_flat +{ + uint16_t blkd[105]; +}; + #if defined(DATA_A_Q6_K) #define QUANT_K QUANT_K_Q6_K #define A_TYPE block_q6_K #define A_TYPE_PACKED16 block_q6_K_packed16 +#define A_TYPE_PACKED16_FLAT block_q6_K_packed16_flat #endif // IQuants From a56504fd7bc61a1e9d0417e6f5b269833ddd3bb1 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Fri, 27 Dec 2024 14:54:08 -0500 Subject: [PATCH 03/20] hacky edition --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 +++--- .../ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp | 11 +++-------- ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 4 +++- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c0a43631c8796..d1b812a5d4241 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1877,7 +1877,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, 
mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); @@ -1891,7 +1891,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, 
{device->subgroup_size, 2}, 1); @@ -1905,7 +1905,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); // dequant shaders diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 9392f7e6f278c..d922cafe379bb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -11,7 +11,6 @@ layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared block_q6_K_packed16_flat blkcache[BLOCK_SIZE/16]; -shared FLOAT_TYPE scales[NUM_ROWS/4][16]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; @@ -35,7 +34,6 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint qh_offset = 32*v_im + l0; const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; - const uint shift = (itid%2 == 1) ? 8 : 0; FLOAT_TYPE temp[NUM_ROWS]; @@ -62,16 +60,13 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (int l = 0; (l < 4) && (i0 + l < num_blocks_per_row); ++l) { blkcache[l].blkd[tid] = data_a_packed16_flat[ib0 + i0 + l].blkd[tid]; // we read beyond the struct size but it looks like vulkan doesn't care? 
o_O - // it's faster than using a branch to reduce the number of threads though + // this assumes that the struct is packed in continuous 16 bit blocks to work blkcache[l].blkd[64 + tid] = data_a_packed16_flat[ib0 + i0 + l].blkd[64 + tid]; } - scales[ix][itid] = FLOAT_TYPE(int8_t((blkcache[ix].blkd[96 + itid/2] >> shift) & 0xFF)); barrier(); if (i >= num_blocks_per_row) continue; - const FLOAT_TYPE d = FLOAT_TYPE(blkcache[ix][ib0 + i][104]); - uint32_t ql0_u32 = uint32_t(blkcache[ix].blkd[ql_offset / 2]) | (uint32_t(blkcache[ix].blkd[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].blkd[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blkd[ql_offset / 2 + 17]) << 16); @@ -105,8 +100,8 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { } [[unroll]] for (int l = 0; l < 4; ++l) - sum[l] *= scales[ix][s_offset + l*2]; - temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; + sum[l] *= FLOAT_TYPE(blkcache[ix].scales[s_offset + l*2]); + temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * FLOAT_TYPE(blkcache[ix].d); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index a6826dd467715..397b2f2252ca4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -275,7 +275,9 @@ struct block_q6_K_packed16 struct block_q6_K_packed16_flat { - uint16_t blkd[105]; + uint16_t blkd[96]; + int8_t scales[16]; + float16_t d; }; #if defined(DATA_A_Q6_K) From f833b751757ef76f301068a12423eb88c8627740 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Fri, 27 Dec 2024 15:07:34 -0500 Subject: [PATCH 04/20] go all the way --- .../vulkan-shaders/dequant_funcs.comp | 3 --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 16 ++++++++-------- ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 8 -------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index 14490f1a86fee..91bb8f8db610e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -7,9 +7,6 @@ #if defined(A_TYPE_PACKED16) layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; #endif -#if defined(A_TYPE_PACKED16_FLAT) -layout (binding = 0) readonly buffer A_PACKED16_FLAT {A_TYPE_PACKED16_FLAT data_a_packed16_flat[];}; -#endif #if defined(A_TYPE_PACKED32) layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];}; #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index d922cafe379bb..ccaa0486df968 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -10,7 +10,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; -shared block_q6_K_packed16_flat blkcache[BLOCK_SIZE/16]; +shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; @@ -22,7 +22,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; const uint itid = tid%16; // 0...15 - const uint ix 
= tid/16; + const uint ix = tid/16; const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... const uint v_in = itid - 8*v_im; // 0...15 or 0...7 @@ -58,24 +58,24 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // cache full superblock into shared memory with coalesced reads // we assume 64 threads here! [[unroll]] for (int l = 0; (l < 4) && (i0 + l < num_blocks_per_row); ++l) { - blkcache[l].blkd[tid] = data_a_packed16_flat[ib0 + i0 + l].blkd[tid]; - // we read beyond the struct size but it looks like vulkan doesn't care? o_O + blkcache[l].ql[tid] = data_a_packed16[ib0 + i0 + l].ql[tid]; + // hacky method of reading beyond ql and the block struct size but it looks like vulkan doesn't care? o_O // this assumes that the struct is packed in continuous 16 bit blocks to work - blkcache[l].blkd[64 + tid] = data_a_packed16_flat[ib0 + i0 + l].blkd[64 + tid]; + blkcache[l].ql[64 + tid] = data_a_packed16[ib0 + i0 + l].ql[64 + tid]; } barrier(); if (i >= num_blocks_per_row) continue; - uint32_t ql0_u32 = uint32_t(blkcache[ix].blkd[ql_offset / 2]) | (uint32_t(blkcache[ix].blkd[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].blkd[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blkd[ql_offset / 2 + 17]) << 16); + uint32_t ql0_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 17]) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(blkcache[ix].blkd[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blkd[64 + qh_offset / 2 + 1]) << 16); + uint32_t qh_u32 = uint32_t(blkcache[ix].qh[qh_offset / 2]) | (uint32_t(blkcache[ix].qh[qh_offset / 2 + 1]) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 397b2f2252ca4..eecc47f3a9764 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -273,18 +273,10 @@ struct block_q6_K_packed16 float16_t d; }; -struct block_q6_K_packed16_flat -{ - uint16_t blkd[96]; - int8_t scales[16]; - float16_t d; -}; - #if defined(DATA_A_Q6_K) #define QUANT_K QUANT_K_Q6_K #define A_TYPE block_q6_K #define A_TYPE_PACKED16 block_q6_K_packed16 -#define A_TYPE_PACKED16_FLAT block_q6_K_packed16_flat #endif // IQuants From 25d7ae429df749b7f142f29a5cf77f456e7bffcc Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Fri, 27 Dec 2024 15:44:59 -0500 Subject: [PATCH 05/20] go even further --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index ccaa0486df968..f1464b89947a8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -10,6 +10,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +shared FLOAT_TYPE 
sccache[BLOCK_SIZE/16][16]; shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { @@ -57,12 +58,13 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // cache full superblock into shared memory with coalesced reads // we assume 64 threads here! - [[unroll]] for (int l = 0; (l < 4) && (i0 + l < num_blocks_per_row); ++l) { - blkcache[l].ql[tid] = data_a_packed16[ib0 + i0 + l].ql[tid]; - // hacky method of reading beyond ql and the block struct size but it looks like vulkan doesn't care? o_O - // this assumes that the struct is packed in continuous 16 bit blocks to work - blkcache[l].ql[64 + tid] = data_a_packed16[ib0 + i0 + l].ql[64 + tid]; + // + // hacky method of reading beyond ql and the block struct size but it looks like vulkan doesn't care? o_O + // this assumes that the struct is packed in continuous 16 bit blocks to work + [[unroll]] for (int l = 0; l < 7; ++l) { + blkcache[0].ql[tid + 64*l] = data_a_packed16[ib0 + i0].ql[tid + 64*l]; } + sccache[ix][itid] = FLOAT_TYPE(blkcache[ix].scales[itid]); barrier(); if (i >= num_blocks_per_row) continue; @@ -100,7 +102,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { } [[unroll]] for (int l = 0; l < 4; ++l) - sum[l] *= FLOAT_TYPE(blkcache[ix].scales[s_offset + l*2]); + sum[l] *= sccache[ix][s_offset + l*2]; temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * FLOAT_TYPE(blkcache[ix].d); } } From b3b30e42fcd6767c617fba4c7eb98db1851bd944 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sat, 28 Dec 2024 00:57:36 +0000 Subject: [PATCH 06/20] Update mul_mat_vec_q6_k.comp --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index f1464b89947a8..64aa46169e0cf 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -11,7 +11,7 @@ layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; -shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; +shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16 + 1]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; From 860159c10d2e8fe8cb47602eb5ea9fcceee5f18b Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sat, 28 Dec 2024 22:22:46 -0500 Subject: [PATCH 07/20] new safe method --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 42 ++++++++++++------- .../src/ggml-vulkan/vulkan-shaders/types.comp | 8 ++-- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 64aa46169e0cf..25919ce85e588 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -11,7 +11,7 @@ layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; -shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16 + 1]; +shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, 
d_offset; @@ -33,8 +33,9 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; - const uint s_offset = 8*v_im + is; + const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; + const uint bcs_offset = (itid%2 == 1) ? 8 : 0; FLOAT_TYPE temp[NUM_ROWS]; @@ -58,26 +59,39 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // cache full superblock into shared memory with coalesced reads // we assume 64 threads here! - // - // hacky method of reading beyond ql and the block struct size but it looks like vulkan doesn't care? o_O - // this assumes that the struct is packed in continous 16 bit blocks to work - [[unroll]] for (int l = 0; l < 7; ++l) { - blkcache[0].ql[tid + 64*l] = data_a_packed16[ib0 + i0].ql[tid + 64*l]; + const int blim = min(int(num_blocks_per_row) - int(i0), 4); + // this is required as this loop is super sensitive to unrolling with hardcoded 4 + if (blim == 4) { + if (tid < 52) { + [[unroll]] for (int l = 0; l < 4; ++l) { + blkcache[l].blk[tid] = data_a_packed16[ib0 + i0 + l].blk[tid]; + blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52]; + } + } + } else { + if (tid < 52) { + [[unroll]] for (int l = 0; l < blim; ++l) { + blkcache[l].blk[tid] = data_a_packed16[ib0 + i0 + l].blk[tid]; + blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52]; + } + } } - sccache[ix][itid] = FLOAT_TYPE(blkcache[ix].scales[itid]); + sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); barrier(); if (i >= num_blocks_per_row) continue; - uint32_t ql0_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].ql[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].ql[ql_offset / 2 + 17]) << 16); + const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + + uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(blkcache[ix].qh[qh_offset / 2]) | (uint32_t(blkcache[ix].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); @@ -94,16 +108,16 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uvec4 q3 = uvec4(unpack8(q3_u32)); FLOAT_TYPE sum[4] = {0, 0, 0, 0}; - [[unroll]] for (int l = 0; l < 4; ++l) { + [[unroll]] for (uint l = 0; l < 4; ++l) { sum[0] = fma(FLOAT_TYPE(by0[l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]); sum[1] = fma(FLOAT_TYPE(by32[l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]); sum[2] = fma(FLOAT_TYPE(by64[l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]); sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); } - [[unroll]] for (int l = 0; l < 4; ++l) + [[unroll]] for (uint l = 0; l < 4; ++l) sum[l] *= sccache[ix][s_offset + l*2]; - temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * 
FLOAT_TYPE(blkcache[ix].d); + temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index eecc47f3a9764..04698cb4ca288 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -267,9 +267,11 @@ struct block_q6_K struct block_q6_K_packed16 { - uint16_t ql[QUANT_K_Q6_K/2/2]; - uint16_t qh[QUANT_K_Q6_K/4/2]; - int8_t scales[QUANT_K_Q6_K/16]; + // blk contains the following: + // uint16_t ql[QUANT_K_Q6_K/2/2]; + // uint16_t qh[QUANT_K_Q6_K/4/2]; + // uint16_t scales[QUANT_K_Q6_K/8]; + uint16_t blk[104]; float16_t d; }; From 64bb149d53e9023ddc5cad1e9ce488ce11e2ab6d Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 29 Dec 2024 17:05:19 -0500 Subject: [PATCH 08/20] data b cache example, slower than original --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 25919ce85e588..4f60aa8bee646 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -12,6 +12,7 @@ layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; +shared B_TYPE bycache[BLOCK_SIZE/16][QUANT_K]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; @@ -34,7 +35,6 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; const uint s_offset = 8*v_im + is; - const uint y_offset = 128*v_im + l0; const uint bcs_offset = (itid%2 == 1) ? 8 : 0; FLOAT_TYPE temp[NUM_ROWS]; @@ -45,12 +45,16 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (uint i0 = 0; i0 < num_blocks_per_row; i0 += it_size) { uint i = i0 + ix; // 16 thread group specific counter - const uint y_idx = i * QUANT_K + y_offset; + const uint y_idx = i0 * QUANT_K; + const int blim = min(int(num_blocks_per_row) - int(i0), 4); - B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; - B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; - B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; - B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; + // assume 64 threads + [[unroll]] for (int n = 0; n < blim; ++n) { + [[unroll]] for (int l = 0; l < 4; ++l) { + bycache[n][tid + 64*l] = data_b[b_offset + y_idx + QUANT_K*n + tid + 64*l]; + } + } + barrier(); uint ibi = first_row*num_blocks_per_row; [[unroll]] for (uint n = 0; n < num_rows; ++n) { @@ -59,7 +63,6 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // cache full superblock into shared memory with coalesced reads // we assume 64 threads here! 
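// To make the coalescing above concrete: thread t of the 64-thread group reads
// 16-bit word t of a superblock, so consecutive threads touch consecutive memory.
// A minimal sketch of the fill pattern used here (same names as this shader;
// 52 threads x 2 words = 104 words, covering ql, qh and scales of one block):
//   if (tid < 52) {
//       [[unroll]] for (int l = 0; l < 4; ++l) {      // four superblocks per pass
//           blkcache[l].blk[tid]      = data_a_packed16[ib0 + i0 + l].blk[tid];
//           blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52];
//       }
//   }
//   barrier();                                        // publish before any thread reads back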
- const int blim = min(int(num_blocks_per_row) - int(i0), 4); // this is required as this loop is super sensitive to unrolling with hardcoded 4 if (blim == 4) { if (tid < 52) { @@ -109,10 +112,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { FLOAT_TYPE sum[4] = {0, 0, 0, 0}; [[unroll]] for (uint l = 0; l < 4; ++l) { - sum[0] = fma(FLOAT_TYPE(by0[l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]); - sum[1] = fma(FLOAT_TYPE(by32[l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]); - sum[2] = fma(FLOAT_TYPE(by64[l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]); - sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); + sum[0] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]); + sum[1] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + 32 + l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]); + sum[2] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + 64 + l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]); + sum[3] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + 96 + l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); } [[unroll]] for (uint l = 0; l < 4; ++l) From e9cbe702b62718330d0c7cc4100fbd2de64e5993 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 29 Dec 2024 20:36:14 -0500 Subject: [PATCH 09/20] unfinished restructure example, didn't continue as it's really slow already 15 t/s --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 43 ++++--------------- .../src/ggml-vulkan/vulkan-shaders/types.comp | 4 +- 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 4f60aa8bee646..c1fd30314a15c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -86,41 +86,14 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); - - uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; - uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; - uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; - uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - - uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); - uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; - uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; - uint32_t qh4_u32 = (qh_u32 & 0x30303030); - uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; - - uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; - uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; - uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; - uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; - - uvec4 q0 = uvec4(unpack8(q0_u32)); - uvec4 q1 = uvec4(unpack8(q1_u32)); - uvec4 q2 = uvec4(unpack8(q2_u32)); - uvec4 q3 = uvec4(unpack8(q3_u32)); - - FLOAT_TYPE sum[4] = {0, 0, 0, 0}; - [[unroll]] for (uint l = 0; l < 4; ++l) { - sum[0] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]); - sum[1] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + 32 + l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]); - sum[2] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 + 64 + l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]); - sum[3] = fma(FLOAT_TYPE(bycache[ix][128*v_im + l0 
+ 96 + l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); - } - - [[unroll]] for (uint l = 0; l < 4; ++l) - sum[l] *= sccache[ix][s_offset + l*2]; - temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; + FLOAT_TYPE dq[16]; + FLOAT_TYPE sum = 0; + [[unroll]] for (uint l = 0; l < 16; ++l) { + dq[l] = bitfieldExtract(blkcache[ix].blk[l/2], 1, 4) | (bitfieldExtract(blkcache[ix].blk[64 + l], 2, 2) << 4); + sum = fma(FLOAT_TYPE(bycache[ix][16*itid + l]), dq[l], sum); + } + + temp[n] += sum * sccache[ix][itid] * d; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 04698cb4ca288..7cf898d009b15 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -269,8 +269,8 @@ struct block_q6_K_packed16 { // blk contains the following: // uint16_t ql[QUANT_K_Q6_K/2/2]; - // uint16_t qh[QUANT_K_Q6_K/4/2]; - // uint16_t scales[QUANT_K_Q6_K/8]; + // uint16_t qh[QUANT_K_Q6_K/4/2]; 64 + // uint16_t scales[QUANT_K_Q6_K/8]; 96 uint16_t blk[104]; float16_t d; }; From 5641108a330a9397db6a989bafc435d72aac8a38 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 29 Dec 2024 20:36:59 -0500 Subject: [PATCH 10/20] revert This reverts commit ee88a8999f517f60d9f079b7781fadf623cb1f72. Revert "data b cache example, slower than original" This reverts commit 97ec5a01403028f0b9c7eeaac858cb9652ff958d. --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 58 +++++++++++++------ .../src/ggml-vulkan/vulkan-shaders/types.comp | 4 +- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index c1fd30314a15c..25919ce85e588 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -12,7 +12,6 @@ layout (constant_id = 1) const uint NUM_ROWS = 1; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; -shared B_TYPE bycache[BLOCK_SIZE/16][QUANT_K]; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; @@ -35,6 +34,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; const uint s_offset = 8*v_im + is; + const uint y_offset = 128*v_im + l0; const uint bcs_offset = (itid%2 == 1) ? 
8 : 0; FLOAT_TYPE temp[NUM_ROWS]; @@ -45,16 +45,12 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { [[unroll]] for (uint i0 = 0; i0 < num_blocks_per_row; i0 += it_size) { uint i = i0 + ix; // 16 thread group specific counter - const uint y_idx = i0 * QUANT_K; - const int blim = min(int(num_blocks_per_row) - int(i0), 4); + const uint y_idx = i * QUANT_K + y_offset; - // assume 64 threads - [[unroll]] for (int n = 0; n < blim; ++n) { - [[unroll]] for (int l = 0; l < 4; ++l) { - bycache[n][tid + 64*l] = data_b[b_offset + y_idx + QUANT_K*n + tid + 64*l]; - } - } - barrier(); + B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; + B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; + B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; + B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; uint ibi = first_row*num_blocks_per_row; [[unroll]] for (uint n = 0; n < num_rows; ++n) { @@ -63,6 +59,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // cache full superblock into shared memory with coalesced reads // we assume 64 threads here! + const int blim = min(int(num_blocks_per_row) - int(i0), 4); // this is required as this loop is super sensitive to unrolling with hardcoded 4 if (blim == 4) { if (tid < 52) { @@ -86,14 +83,41 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - FLOAT_TYPE dq[16]; - FLOAT_TYPE sum = 0; - [[unroll]] for (uint l = 0; l < 16; ++l) { - dq[l] = bitfieldExtract(blkcache[ix].blk[l/2], 1, 4) | (bitfieldExtract(blkcache[ix].blk[64 + l], 2, 2) << 4); - sum = fma(FLOAT_TYPE(bycache[ix][16*itid + l]), dq[l], sum); - } + uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); + + uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; + uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; + uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; + uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; + + uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); + uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; + uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; + uint32_t qh4_u32 = (qh_u32 & 0x30303030); + uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; + + uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; + uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; + uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; + uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; + + uvec4 q0 = uvec4(unpack8(q0_u32)); + uvec4 q1 = uvec4(unpack8(q1_u32)); + uvec4 q2 = uvec4(unpack8(q2_u32)); + uvec4 q3 = uvec4(unpack8(q3_u32)); + + FLOAT_TYPE sum[4] = {0, 0, 0, 0}; + [[unroll]] for (uint l = 0; l < 4; ++l) { + sum[0] = fma(FLOAT_TYPE(by0[l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]); + sum[1] = fma(FLOAT_TYPE(by32[l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]); + sum[2] = fma(FLOAT_TYPE(by64[l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]); + sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); + } - temp[n] += sum * sccache[ix][itid] * d; + [[unroll]] for (uint l = 0; l < 4; ++l) + sum[l] *= sccache[ix][s_offset + l*2]; + temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 7cf898d009b15..04698cb4ca288 100644 --- 
a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -269,8 +269,8 @@ struct block_q6_K_packed16 { // blk contains the following: // uint16_t ql[QUANT_K_Q6_K/2/2]; - // uint16_t qh[QUANT_K_Q6_K/4/2]; 64 - // uint16_t scales[QUANT_K_Q6_K/8]; 96 + // uint16_t qh[QUANT_K_Q6_K/4/2]; + // uint16_t scales[QUANT_K_Q6_K/8]; uint16_t blk[104]; float16_t d; }; From bdd1e4ddc7bc982cacf534c096956cce661d9bec Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 29 Dec 2024 22:28:27 -0500 Subject: [PATCH 11/20] support different subgroup sizes (tested) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 +- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 60 +++++++++++-------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index d1b812a5d4241..c0a43631c8796 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1877,7 +1877,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); @@ -1891,7 +1891,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", 
mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); @@ -1905,7 +1905,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); // dequant shaders diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 25919ce85e588..2d9fd8eb0ea4b 100644 --- 
a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -13,6 +13,29 @@ shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; +uint fill_blkcache_its(uint wg_size) { + // subgroup sizes are always a power of 2 + if (wg_size > 64) + return 1; + else if (wg_size == 64) + return 2; + else if (wg_size == 32) + return 4; + else + return 8; +} + +void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const uint tid, const uint fbi) { + uint bc_t = 104 / fbi; + if (tid < bc_t) { + [[unroll]] for (int l = 0; l < num_blocks; ++l) { + [[unroll]] for (int m = 0; m < fbi; ++m) + // cache full superblock into shared memory with coalesced reads + blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; + } + } +} + void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -24,6 +47,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint tid = gl_LocalInvocationID.x; const uint itid = tid%16; // 0...15 const uint ix = tid/16; + const uint fbi = fill_blkcache_its(gl_WorkGroupSize.x); const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... const uint v_in = itid - 8*v_im; // 0...15 or 0...7 @@ -38,10 +62,8 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint bcs_offset = (itid%2 == 1) ? 8 : 0; FLOAT_TYPE temp[NUM_ROWS]; - - [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { + [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) temp[i] = FLOAT_TYPE(0); - } [[unroll]] for (uint i0 = 0; i0 < num_blocks_per_row; i0 += it_size) { uint i = i0 + ix; // 16 thread group specific counter @@ -55,33 +77,23 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint ibi = first_row*num_blocks_per_row; [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + ibi; - ibi += num_blocks_per_row; + const int blim = min(int(num_blocks_per_row) - int(i0), int(it_size)); - // cache full superblock into shared memory with coalesced reads - // we assume 64 threads here! 
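// For reference, fill_blkcache_its() above sizes this copy loop so that any
// workgroup width still moves the whole 104-word superblock: bc_t = 104/fbi
// threads copy fbi words each, i.e. wg > 64 -> 104x1, wg == 64 -> 52x2,
// wg == 32 -> 26x4, wg == 16 -> 13x8. A quick sketch of the indexing it
// produces for the wg == 64 case (fbi == 2, bc_t == 52):
//   blkcache[l].blk[tid + 0*52] = data_a_packed16[ib0 + i0 + l].blk[tid + 0*52];
//   blkcache[l].blk[tid + 1*52] = data_a_packed16[ib0 + i0 + l].blk[tid + 1*52];
// so thread tid writes words tid and tid + 52, and the reads stay coalesced.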
- const int blim = min(int(num_blocks_per_row) - int(i0), 4); - // this is required as this loop is super sensitive to unrolling with hardcoded 4 - if (blim == 4) { - if (tid < 52) { - [[unroll]] for (int l = 0; l < 4; ++l) { - blkcache[l].blk[tid] = data_a_packed16[ib0 + i0 + l].blk[tid]; - blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52]; - } - } + // fill_blkcache is sensitive to unrolling with hardcoded it_size + if (blim == it_size) { + fill_blkcache(int(it_size), ib0, i0, tid, fbi); } else { - if (tid < 52) { - [[unroll]] for (int l = 0; l < blim; ++l) { - blkcache[l].blk[tid] = data_a_packed16[ib0 + i0 + l].blk[tid]; - blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52]; - } - } + fill_blkcache(blim, ib0, i0, tid, fbi); } + sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); barrier(); + + ibi += num_blocks_per_row; if (i >= num_blocks_per_row) continue; - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d); uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); @@ -115,9 +127,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); } - [[unroll]] for (uint l = 0; l < 4; ++l) - sum[l] *= sccache[ix][s_offset + l*2]; - temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d; + temp[n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[n]); } } From 5597614a302adc19b4ecdbd0055518f6a3b71215 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 31 Dec 2024 16:37:03 -0500 Subject: [PATCH 12/20] 32 bit cache (slower) --- .../ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 2d9fd8eb0ea4b..5bc1b06144e24 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -9,9 +9,16 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; +// a 32 bit cache potentially might write faster due to banking +struct block_q6_K_32stor +{ + uint32_t blk[104]; + float16_t d; +}; + shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; -shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; +shared block_q6_K_32stor blkcache[BLOCK_SIZE/16]; uint fill_blkcache_its(uint wg_size) { // subgroup sizes are always a power of 2 @@ -31,7 +38,7 @@ void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const ui [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; + blkcache[l].blk[tid + m*bc_t] = uint32_t(data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]); } } } From 77fe42858cd491cc3cd1fce914628ff3933a9caa Mon 
Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 31 Dec 2024 22:47:54 -0500 Subject: [PATCH 13/20] failed subgroup experiment (slower) --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 51 +++++++------------ 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 5bc1b06144e24..a70a24b0a76a6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,6 +1,9 @@ #version 450 #extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_EXT_shader_subgroup_extended_types_int16 : require #include "mul_mat_vec_base.comp" @@ -9,16 +12,11 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; -// a 32 bit cache potentially might write faster due to banking -struct block_q6_K_32stor -{ - uint32_t blk[104]; - float16_t d; -}; +uint16_t blk[BLOCK_SIZE/16][8]; -shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; -shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; -shared block_q6_K_32stor blkcache[BLOCK_SIZE/16]; +uint16_t get_blk_shuffle(uint fbi, uint ix, uint ofst) { + return subgroupShuffle(blk[ix][ofst/(104/fbi)], ofst%(104/fbi)); +} uint fill_blkcache_its(uint wg_size) { // subgroup sizes are always a power of 2 @@ -38,7 +36,7 @@ void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const ui [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blkcache[l].blk[tid + m*bc_t] = uint32_t(data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]); + blk[l][m] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; } } } @@ -64,7 +62,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; - const uint s_offset = 8*v_im + is; + const uint s_offset = 16*ix + 8*v_im + is; const uint y_offset = 128*v_im + l0; const uint bcs_offset = (itid%2 == 1) ? 
8 : 0; @@ -93,7 +91,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { fill_blkcache(blim, ib0, i0, tid, fbi); } - sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); + FLOAT_TYPE sccache = FLOAT_TYPE(int8_t(bitfieldExtract(get_blk_shuffle(fbi, ix, 96 + itid/2), int(bcs_offset), 8))); barrier(); ibi += num_blocks_per_row; @@ -102,15 +100,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d); - uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); + uint32_t ql0_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 1)) << 16); + uint32_t ql32_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 16)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 17)) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); + uint32_t qh_u32 = uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2 + 1)) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); @@ -134,28 +132,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); } - temp[n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[n]); + temp[n] = fma(fma(sum[0], subgroupShuffle(sccache, s_offset), fma(sum[1], subgroupShuffle(sccache, s_offset + 2), fma(sum[2], subgroupShuffle(sccache, s_offset + 4), sum[3] * subgroupShuffle(sccache, s_offset + 6)))), d, temp[n]); } } // sum up partial sums and write back result - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - tmpsh[n][tid] = temp[n]; - } - barrier(); - [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { - if (tid < s) { - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - tmpsh[n][tid] += tmpsh[n][tid + s]; - } - } - barrier(); - } - if (tid == 0) { - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); - } - } + [[unroll]] for (uint n = 0; n < num_rows; ++n) + temp[n] = subgroupAdd(temp[n]); + if (tid < num_rows) + data_d[d_offset + first_row + tid] = D_TYPE(temp[tid]); } void main() { From 12f1cdc196775fd64390c54a3e52a4da5b988a15 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 31 Dec 2024 22:55:22 -0500 Subject: [PATCH 14/20] initial subgroup test --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index a70a24b0a76a6..b156a83c8a867 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ 
b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,9 +1,8 @@ #version 450 #extension GL_EXT_shader_explicit_arithmetic_types : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_EXT_shader_subgroup_extended_types_int16 : require +#extension GL_KHR_shader_subgroup_arithmetic: require +#extension GL_KHR_shader_subgroup_shuffle: require #include "mul_mat_vec_base.comp" @@ -12,11 +11,7 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; -uint16_t blk[BLOCK_SIZE/16][8]; - -uint16_t get_blk_shuffle(uint fbi, uint ix, uint ofst) { - return subgroupShuffle(blk[ix][ofst/(104/fbi)], ofst%(104/fbi)); -} +shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; uint fill_blkcache_its(uint wg_size) { // subgroup sizes are always a power of 2 @@ -36,7 +31,7 @@ void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const ui [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blk[l][m] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; + blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; } } } @@ -91,7 +86,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { fill_blkcache(blim, ib0, i0, tid, fbi); } - FLOAT_TYPE sccache = FLOAT_TYPE(int8_t(bitfieldExtract(get_blk_shuffle(fbi, ix, 96 + itid/2), int(bcs_offset), 8))); + FLOAT_TYPE sccache = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); barrier(); ibi += num_blocks_per_row; @@ -100,15 +95,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d); - uint32_t ql0_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 1)) << 16); - uint32_t ql32_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 16)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 17)) << 16); + uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2 + 1)) << 16); + uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); From d7f4663a7c518241ab0c66cddf284aaf65503771 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 31 Dec 2024 22:55:34 -0500 Subject: [PATCH 15/20] Revert "initial subgroup test" This reverts commit 12f1cdc196775fd64390c54a3e52a4da5b988a15. 
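[Note: for reference, a minimal standalone sketch of the scale-broadcast idea patches 13 and 14 experiment with, both of which were measured slower and reverted below. Each of the 16 cooperating invocations keeps one decoded q6_K scale in a register, and neighbours' copies are read with subgroupShuffle instead of going through the sccache shared array; this only works when all 16 invocations sit in one subgroup, which is why patch 13 folds 16*ix into s_offset. The buffer names and the scales_src stand-in are illustrative, and the sketch assumes the subgroup spans the whole 32-thread workgroup so that subgroup lane and local thread id coincide.

#version 450
#extension GL_KHR_shader_subgroup_shuffle : require

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

layout(std430, binding = 0) readonly buffer Scales { float scales_src[]; };
layout(std430, binding = 1) writeonly buffer Result { float result[]; };

void main() {
    const uint tid  = gl_LocalInvocationID.x;
    const uint base = tid & ~15u;        // start of this 16-thread group

    // each invocation holds one scale in a register instead of shared memory
    float my_scale = scales_src[tid];

    // any invocation can read any of its group's 16 scales with a shuffle,
    // with no shared-memory traffic and no barrier
    float s0 = subgroupShuffle(my_scale, base + 0u);
    float s2 = subgroupShuffle(my_scale, base + 2u);
    result[tid] = s0 + s2;
}
]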
--- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index b156a83c8a867..a70a24b0a76a6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,8 +1,9 @@ #version 450 #extension GL_EXT_shader_explicit_arithmetic_types : require -#extension GL_KHR_shader_subgroup_arithmetic: require -#extension GL_KHR_shader_subgroup_shuffle: require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_EXT_shader_subgroup_extended_types_int16 : require #include "mul_mat_vec_base.comp" @@ -11,7 +12,11 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; -shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; +uint16_t blk[BLOCK_SIZE/16][8]; + +uint16_t get_blk_shuffle(uint fbi, uint ix, uint ofst) { + return subgroupShuffle(blk[ix][ofst/(104/fbi)], ofst%(104/fbi)); +} uint fill_blkcache_its(uint wg_size) { // subgroup sizes are always a power of 2 @@ -31,7 +36,7 @@ void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const ui [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; + blk[l][m] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; } } } @@ -86,7 +91,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { fill_blkcache(blim, ib0, i0, tid, fbi); } - FLOAT_TYPE sccache = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); + FLOAT_TYPE sccache = FLOAT_TYPE(int8_t(bitfieldExtract(get_blk_shuffle(fbi, ix, 96 + itid/2), int(bcs_offset), 8))); barrier(); ibi += num_blocks_per_row; @@ -95,15 +100,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d); - uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); + uint32_t ql0_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 1)) << 16); + uint32_t ql32_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 16)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 17)) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); + uint32_t qh_u32 = uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2 + 1)) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); From 7d7a9e2401e95c91a2d1518ce585e50d27bcb9a5 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 
31 Dec 2024 22:55:46 -0500 Subject: [PATCH 16/20] Revert "failed subgroup experiment (slower)" This reverts commit 77fe42858cd491cc3cd1fce914628ff3933a9caa. --- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index a70a24b0a76a6..5bc1b06144e24 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,9 +1,6 @@ #version 450 #extension GL_EXT_shader_explicit_arithmetic_types : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_EXT_shader_subgroup_extended_types_int16 : require #include "mul_mat_vec_base.comp" @@ -12,11 +9,16 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; -uint16_t blk[BLOCK_SIZE/16][8]; +// a 32 bit cache potentially might write faster due to banking +struct block_q6_K_32stor +{ + uint32_t blk[104]; + float16_t d; +}; -uint16_t get_blk_shuffle(uint fbi, uint ix, uint ofst) { - return subgroupShuffle(blk[ix][ofst/(104/fbi)], ofst%(104/fbi)); -} +shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; +shared block_q6_K_32stor blkcache[BLOCK_SIZE/16]; uint fill_blkcache_its(uint wg_size) { // subgroup sizes are always a power of 2 @@ -36,7 +38,7 @@ void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const ui [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blk[l][m] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; + blkcache[l].blk[tid + m*bc_t] = uint32_t(data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]); } } } @@ -62,7 +64,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; - const uint s_offset = 16*ix + 8*v_im + is; + const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; const uint bcs_offset = (itid%2 == 1) ? 
8 : 0; @@ -91,7 +93,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { fill_blkcache(blim, ib0, i0, tid, fbi); } - FLOAT_TYPE sccache = FLOAT_TYPE(int8_t(bitfieldExtract(get_blk_shuffle(fbi, ix, 96 + itid/2), int(bcs_offset), 8))); + sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); barrier(); ibi += num_blocks_per_row; @@ -100,15 +102,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d); - uint32_t ql0_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 1)) << 16); - uint32_t ql32_u32 = uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 16)) | (uint32_t(get_blk_shuffle(fbi, ix, ql_offset / 2 + 17)) << 16); + uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - uint32_t qh_u32 = uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2)) | (uint32_t(get_blk_shuffle(fbi, ix, 64 + qh_offset / 2 + 1)) << 16); + uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16); uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; uint32_t qh4_u32 = (qh_u32 & 0x30303030); @@ -132,15 +134,28 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]); } - temp[n] = fma(fma(sum[0], subgroupShuffle(sccache, s_offset), fma(sum[1], subgroupShuffle(sccache, s_offset + 2), fma(sum[2], subgroupShuffle(sccache, s_offset + 4), sum[3] * subgroupShuffle(sccache, s_offset + 6)))), d, temp[n]); + temp[n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[n]); } } // sum up partial sums and write back result - [[unroll]] for (uint n = 0; n < num_rows; ++n) - temp[n] = subgroupAdd(temp[n]); - if (tid < num_rows) - data_d[d_offset + first_row + tid] = D_TYPE(temp[tid]); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] = temp[n]; + } + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { + if (tid < s) { + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] += tmpsh[n][tid + s]; + } + } + barrier(); + } + if (tid == 0) { + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); + } + } } void main() { From 3c31ceac88308567c48c26957b3d990138c09ee3 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 31 Dec 2024 22:55:51 -0500 Subject: [PATCH 17/20] Revert "32 bit cache (slower)" This reverts commit 5597614a302adc19b4ecdbd0055518f6a3b71215. 
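[Note: for reference, a minimal sketch of the shared-memory banking trade-off behind the 32-bit cache experiment this patch reverts. Shared memory is typically organized into 32-bit banks, so two adjacent uint16_t elements share a bank and their accesses can serialize; widening every element to its own uint32_t slot puts one element per bank at twice the footprint. The measurement above found the 16-bit layout faster anyway. The struct and buffer names are illustrative; 104 is the number of 16-bit words in one q6_K superblock (64 for ql, 32 for qh, 8 for scales).

#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_16bit_storage : require

layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

layout(std430, binding = 0) readonly buffer A { uint16_t src[]; };
layout(std430, binding = 1) writeonly buffer O { uint dst[]; };

struct blk16 { uint16_t blk[104]; };   // packed: two words per 32-bit bank
struct blk32 { uint32_t blk[104]; };   // widened: one word per bank

shared blk16 cache16;
shared blk32 cache32;

void main() {
    const uint tid = gl_LocalInvocationID.x;
    if (tid < 104u) {
        cache16.blk[tid] = src[tid];              // neighbouring threads hit the same bank
        cache32.blk[tid] = uint32_t(src[tid]);    // one bank per element, double the space
    }
    barrier();
    dst[tid] = uint(cache16.blk[tid % 104u]) + cache32.blk[tid % 104u];
}
]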
--- .../ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 5bc1b06144e24..2d9fd8eb0ea4b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -9,16 +9,9 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; -// a 32 bit cache potentially might write faster due to banking -struct block_q6_K_32stor -{ - uint32_t blk[104]; - float16_t d; -}; - shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; -shared block_q6_K_32stor blkcache[BLOCK_SIZE/16]; +shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16]; uint fill_blkcache_its(uint wg_size) { // subgroup sizes are always a power of 2 @@ -38,7 +31,7 @@ void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const ui [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blkcache[l].blk[tid + m*bc_t] = uint32_t(data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]); + blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; } } } From ed038a26e37da36a3df5eba8be3cd8a199ecbab5 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 31 Dec 2024 23:00:16 -0500 Subject: [PATCH 18/20] bct --- .../ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 2d9fd8eb0ea4b..d8ce4ed9f91c6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -25,13 +25,12 @@ uint fill_blkcache_its(uint wg_size) { return 8; } -void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const uint tid, const uint fbi) { - uint bc_t = 104 / fbi; - if (tid < bc_t) { +void fill_blkcache(const int num_blocks, const uint bct, const uint ib0, const uint i0, const uint tid, const uint fbi) { + if (tid < bct) { [[unroll]] for (int l = 0; l < num_blocks; ++l) { [[unroll]] for (int m = 0; m < fbi; ++m) // cache full superblock into shared memory with coalesced reads - blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t]; + blkcache[l].blk[tid + m*bct] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bct]; } } } @@ -48,6 +47,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint itid = tid%16; // 0...15 const uint ix = tid/16; const uint fbi = fill_blkcache_its(gl_WorkGroupSize.x); + const uint bct = 104/fbi; const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... 
const uint v_in = itid - 8*v_im; // 0...15 or 0...7 @@ -81,9 +81,9 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // fill_blkcache is sensitive to unrolling with hardcoded it_size if (blim == it_size) { - fill_blkcache(int(it_size), ib0, i0, tid, fbi); + fill_blkcache(int(it_size), bct, ib0, i0, tid, fbi); } else { - fill_blkcache(blim, ib0, i0, tid, fbi); + fill_blkcache(blim, bct, ib0, i0, tid, fbi); } sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); From 1d949a62c63d1bd3e53df27de3f9eb0a5de83205 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Wed, 1 Jan 2025 16:50:22 -0500 Subject: [PATCH 19/20] subgroup iq4_nl, 3% slower than original --- ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp | 6 ++++-- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp | 6 ++++-- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp | 4 +++- ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 7 ++++--- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index 91bb8f8db610e..35eb0c05c7408 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -1,6 +1,8 @@ #if !defined(DATA_A_F32) && !defined(DATA_A_F16) #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #endif +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_EXT_shader_subgroup_extended_types_float16 : require #include "types.comp" @@ -91,11 +93,11 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) { #if defined(DATA_A_IQ4_NL) vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); - return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]); + return vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4)); } vec4 dequantize4(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); - return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]); + return vec4(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 4) & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 8) & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 12)); } #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp index 8de14fc03f102..2303031d0a030 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp @@ -1,6 +1,8 @@ #version 450 #include "dequant_head.comp" +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_EXT_shader_subgroup_extended_types_float16 : require layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; @@ -26,7 +28,7 @@ void main() { const float d = float(data_a[ib].d); [[unroll]] for (uint l = 0; l < 8; ++l) { - data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]); - data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]); + data_b[b_idx + l + 0] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] & 0xF)); + data_b[b_idx + l + 16] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] >> 4)); } } diff --git 
a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 48122cbef906e..e411948949681 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -2,6 +2,8 @@ #extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_shader_16bit_storage : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_EXT_shader_subgroup_extended_types_float16 : require #ifdef FLOAT16 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require @@ -448,7 +450,7 @@ void main() { const float d = float(data_a[ib].d); const uint vui = uint(data_a[ib].qs[iqs]); - const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d; + const vec2 v = vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4)) * d; buf_a[buf_idx ] = FLOAT_TYPE(v.x); buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 04698cb4ca288..ea576444fb8ca 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -3,6 +3,7 @@ #define GGML_TYPES_COMP #extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_KHR_shader_subgroup_basic : require #if defined(DATA_A_F32) #define QUANT_K 1 @@ -305,13 +306,13 @@ const int8_t kvalues_iq4nl_const[16] = { int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; -shared FLOAT_TYPE kvalues_iq4nl[16]; +FLOAT_TYPE kvalues_iq4nl = FLOAT_TYPE(0); void init_iq4nl_shmem() { // copy the table into shared memory and sync - if (gl_LocalInvocationIndex.x < 16) { - kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]); + if (gl_SubgroupInvocationID < 16) { + kvalues_iq4nl = FLOAT_TYPE(kvalues_iq4nl_const[gl_SubgroupInvocationID]); } barrier(); } From c3efd7df73ebfe8fe371b2edf629a8cf089aaf7e Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Wed, 1 Jan 2025 16:50:37 -0500 Subject: [PATCH 20/20] Revert "subgroup iq4_nl, 3% slower than original" This reverts commit 1d949a62c63d1bd3e53df27de3f9eb0a5de83205. 
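[Note: for reference, a minimal sketch of the register-resident lookup table this revert removes. Lane i of each subgroup keeps codebook entry i in a register, and each 4-bit iq4_nl index is resolved with subgroupShuffle instead of a shared-memory read, which is why it needs a subgroup size of at least 16; the series measured it about 3% slower than the shared table. The kval stand-in and buffer names are illustrative; the real table is kvalues_iq4nl_const.

#version 450
#extension GL_KHR_shader_subgroup_shuffle : require

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

layout(std430, binding = 0) readonly buffer Q { uint qs[]; };   // one packed byte per element
layout(std430, binding = 1) writeonly buffer D { float dst[]; };

void main() {
    // lane i owns table entry i; only lanes 0..15 are ever read from
    const uint lane = gl_SubgroupInvocationID & 15u;
    float kval = float(lane) - 8.0;   // illustrative stand-in for kvalues_iq4nl_const[lane]

    const uint i   = gl_GlobalInvocationID.x;
    const uint vui = qs[i] & 0xFFu;
    // two 4-bit indices per byte, both looked up via shuffle
    dst[2u*i + 0u] = subgroupShuffle(kval, vui & 0xFu);
    dst[2u*i + 1u] = subgroupShuffle(kval, vui >> 4);
}
]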
--- ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp | 6 ++---- ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp | 6 ++---- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp | 4 +--- ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 7 +++---- 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index 35eb0c05c7408..91bb8f8db610e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -1,8 +1,6 @@ #if !defined(DATA_A_F32) && !defined(DATA_A_F16) #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #endif -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_EXT_shader_subgroup_extended_types_float16 : require #include "types.comp" @@ -93,11 +91,11 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) { #if defined(DATA_A_IQ4_NL) vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); - return vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4)); + return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]); } vec4 dequantize4(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); - return vec4(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 4) & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 8) & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 12)); + return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]); } #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp index 2303031d0a030..8de14fc03f102 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp @@ -1,8 +1,6 @@ #version 450 #include "dequant_head.comp" -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_EXT_shader_subgroup_extended_types_float16 : require layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; @@ -28,7 +26,7 @@ void main() { const float d = float(data_a[ib].d); [[unroll]] for (uint l = 0; l < 8; ++l) { - data_b[b_idx + l + 0] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] & 0xF)); - data_b[b_idx + l + 16] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] >> 4)); + data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]); + data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index e411948949681..48122cbef906e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -2,8 +2,6 @@ #extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_shader_16bit_storage : require -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_EXT_shader_subgroup_extended_types_float16 : require #ifdef FLOAT16 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require @@ -450,7 +448,7 @@ void main() { const float d = float(data_a[ib].d); const uint vui = uint(data_a[ib].qs[iqs]); - const vec2 v = vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4)) * d; + const 
vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d; buf_a[buf_idx ] = FLOAT_TYPE(v.x); buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index ea576444fb8ca..04698cb4ca288 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -3,7 +3,6 @@ #define GGML_TYPES_COMP #extension GL_EXT_shader_explicit_arithmetic_types : require -#extension GL_KHR_shader_subgroup_basic : require #if defined(DATA_A_F32) #define QUANT_K 1 @@ -306,13 +305,13 @@ const int8_t kvalues_iq4nl_const[16] = { int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; -FLOAT_TYPE kvalues_iq4nl = FLOAT_TYPE(0); +shared FLOAT_TYPE kvalues_iq4nl[16]; void init_iq4nl_shmem() { // copy the table into shared memory and sync - if (gl_SubgroupInvocationID < 16) { - kvalues_iq4nl = FLOAT_TYPE(kvalues_iq4nl_const[gl_SubgroupInvocationID]); + if (gl_LocalInvocationIndex.x < 16) { + kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]); } barrier(); }
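[Note: after the experiments and reverts above, the change the series actually keeps is the generalized shared-memory superblock cache. The sketch below condenses that surviving pattern under illustrative names (fill_its, blocks): one q6_K superblock is 104 uint16 words, and fbi passes of bct = 104/fbi consecutive 16-bit loads per thread keep the global reads coalesced for any power-of-two workgroup size, replacing the original hard-coded 64-thread assumption.

#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_16bit_storage : require

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

struct block_q6_K_words { uint16_t blk[104]; };   // ql (64) + qh (32) + scales (8)

layout(std430, binding = 0) readonly buffer A { block_q6_K_words blocks[]; };

shared block_q6_K_words blkcache[4];   // one tile of superblocks, as in the shader

uint fill_its(uint wg_size) {
    // workgroup sizes here are powers of two, so these are the only cases
    if (wg_size > 64u) return 1u;
    else if (wg_size == 64u) return 2u;
    else if (wg_size == 32u) return 4u;
    else return 8u;
}

void main() {
    const uint tid = gl_LocalInvocationID.x;
    const uint fbi = fill_its(gl_WorkGroupSize.x);   // passes per thread
    const uint bct = 104u / fbi;                     // threads doing the copy
    if (tid < bct) {
        for (uint l = 0u; l < 4u; ++l)               // superblocks in the tile
            for (uint m = 0u; m < fbi; ++m)          // consecutive, coalesced reads
                blkcache[l].blk[tid + m*bct] = blocks[l].blk[tid + m*bct];
    }
    barrier();
    // ...dequantization and the dot product against data_b would follow here
}
]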