
vulkan: experimental coalesced read to shared memory before dequantization #10999

Closed
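This patch changes the q6_K matrix-vector shader so that each group of 16 threads first copies a whole q6_K superblock into a shared-memory cache (`blkcache`) using coalesced 16-bit loads, and pre-extracts the block scales into `sccache`; dequantization then reads from shared memory instead of issuing scattered global loads. Below is a minimal, self-contained sketch of that access pattern in isolation — the buffer names, bindings, extensions, and the stand-in "dequantization" are invented for illustration and are not the PR's shader:

#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_16bit_storage : require

// Illustrative sketch only: coalesced copy of one packed block into shared
// memory, a barrier, then reads served from the shared copy. The names
// WORDS_PER_BLOCK, Src, Dst, and cache are invented for this example.

layout (local_size_x = 32) in;

const uint WORDS_PER_BLOCK = 104; // one q6_K superblock, in 16-bit words

layout (binding = 0, std430) readonly  buffer Src { uint16_t src[]; };
layout (binding = 1, std430) writeonly buffer Dst { float dst[]; };

shared uint16_t cache[WORDS_PER_BLOCK];

void main() {
    const uint tid = gl_LocalInvocationID.x;

    // Strided fill: on each iteration consecutive threads touch consecutive
    // 16-bit words, so the loads coalesce into wide memory transactions.
    for (uint w = tid; w < WORDS_PER_BLOCK; w += gl_WorkGroupSize.x) {
        cache[w] = src[w];
    }
    barrier(); // the cached block is now visible to the whole workgroup

    // Downstream work (dequantization in the real shader) reads the fast
    // shared copy instead of global memory. A stand-in computation:
    if (tid < WORDS_PER_BLOCK) {
        dst[tid] = float(uint(cache[tid]) & 0x3Fu); // 6-bit mask, as in q6
    }
}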
96 changes: 66 additions & 30 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -10,68 +10,103 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 layout (constant_id = 1) const uint NUM_ROWS = 1;

 shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
+shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
+shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16];

+uint fill_blkcache_its(uint wg_size) {
+    // subgroup sizes are always a power of 2
+    if (wg_size > 64)
+        return 1;
+    else if (wg_size == 64)
+        return 2;
+    else if (wg_size == 32)
+        return 4;
+    else
+        return 8;
+}
+
+void fill_blkcache(const int num_blocks, const uint bct, const uint ib0, const uint i0, const uint tid, const uint fbi) {
+    if (tid < bct) {
+        [[unroll]] for (int l = 0; l < num_blocks; ++l) {
+            [[unroll]] for (int m = 0; m < fbi; ++m)
+                // cache full superblock into shared memory with coalesced reads
+                blkcache[l].blk[tid + m*bct] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bct];
+        }
+    }
+}

 void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

     const uint num_blocks_per_row = p.ncols / QUANT_K;

-    // 16 threads are used to process each block
+    // 16 thread groups are used to process each block
     const uint it_size = gl_WorkGroupSize.x/16;
     const uint tid = gl_LocalInvocationID.x;
-    const uint itid = tid%16; // 0...16
-    const uint ix = tid/16;
-
-    const uint step = 8;
+    const uint itid = tid%16; // 0...15
+    const uint ix = tid/16;
+    const uint fbi = fill_blkcache_its(gl_WorkGroupSize.x);
+    const uint bct = 104/fbi;

-    const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
-    const uint v_in = itid - step*v_im; // 0...15 or 0...7
+    const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
+    const uint v_in = itid - 8*v_im; // 0...15 or 0...7

     const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
     const uint is = v_in / 4;

     const uint ql_offset = 64*v_im + l0;
     const uint qh_offset = 32*v_im + l0;
     const uint s_offset = 8*v_im + is;
     const uint y_offset = 128*v_im + l0;
+    const uint bcs_offset = (itid%2 == 1) ? 8 : 0;

     FLOAT_TYPE temp[NUM_ROWS];

-    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i)
         temp[i] = FLOAT_TYPE(0);
-    }

-    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
+    [[unroll]] for (uint i0 = 0; i0 < num_blocks_per_row; i0 += it_size) {
+        uint i = i0 + ix; // 16 thread group specific counter
         const uint y_idx = i * QUANT_K + y_offset;

         B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
         B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
         B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
         B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];

+        uint ibi = first_row*num_blocks_per_row;
         [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
-            const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
+            const uint ib0 = a_offset / QUANT_K + ibi;
+            const int blim = min(int(num_blocks_per_row) - int(i0), int(it_size));
+
+            // fill_blkcache is sensitive to unrolling with hardcoded it_size
+            if (blim == it_size) {
+                fill_blkcache(int(it_size), bct, ib0, i0, tid, fbi);
+            } else {
+                fill_blkcache(blim, bct, ib0, i0, tid, fbi);
+            }

-            FLOAT_TYPE scales[4];
-            scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
-            scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
-            scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
-            scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
+            sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8)));
+            barrier();

-            uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
-            uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
+            ibi += num_blocks_per_row;
+            if (i >= num_blocks_per_row)
+                continue;
+
+            const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d);
+
+            uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16);
+            uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16);

             uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
             uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
             uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
             uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;

-            uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
+            uint32_t qh_u32 = uint32_t(blkcache[ix].blk[64 + qh_offset / 2]) | (uint32_t(blkcache[ix].blk[64 + qh_offset / 2 + 1]) << 16);
             uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
             uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
-            uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
+            uint32_t qh4_u32 = (qh_u32 & 0x30303030);
             uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;

             uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;

@@ -84,14 +119,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
             uvec4 q2 = uvec4(unpack8(q2_u32));
             uvec4 q3 = uvec4(unpack8(q3_u32));

-            FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-            [[unroll]] for (int l = 0; l < 4; ++l) {
-                sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
-                      fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
-                      fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
-                      fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
+            FLOAT_TYPE sum[4] = {0, 0, 0, 0};
+            [[unroll]] for (uint l = 0; l < 4; ++l) {
+                sum[0] = fma(FLOAT_TYPE(by0[l]), FLOAT_TYPE(int8_t(q0[l]) - 32), sum[0]);
+                sum[1] = fma(FLOAT_TYPE(by32[l]), FLOAT_TYPE(int8_t(q1[l]) - 32), sum[1]);
+                sum[2] = fma(FLOAT_TYPE(by64[l]), FLOAT_TYPE(int8_t(q2[l]) - 32), sum[2]);
+                sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]);
             }
-            temp[n] += sum * d;
+
+            temp[n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[n]);
         }
     }
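For reference, `block_q6_K_packed16` holds 104 uint16 words of quantized data (64 words of ql, 32 of qh, 8 of packed scales) plus the fp16 `d`. `fill_blkcache_its()` picks how many words each thread copies (`fbi`) so that `bct = 104/fbi` threads cover the block exactly; assuming the power-of-two workgroup sizes the comment relies on, the split works out as:

wg_size | fbi | bct = 104/fbi | words copied (bct * fbi)
>64     |  1  |      104      |           104
 64     |  2  |       52      |           104
 32     |  4  |       26      |           104
else    |  8  |       13      |           104

The accumulation is also restructured: the old code folded `scales[j]` into every fma, while the new code accumulates the four raw dot products in `sum[0..3]` and applies the shared-memory scales (`sccache`) and the block scale `d` once at the end, trimming the scale multiplies out of the inner loop.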

8 changes: 5 additions & 3 deletions ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -267,9 +267,11 @@ struct block_q6_K

 struct block_q6_K_packed16
 {
-    uint16_t ql[QUANT_K_Q6_K/2/2];
-    uint16_t qh[QUANT_K_Q6_K/4/2];
-    int8_t scales[QUANT_K_Q6_K/16];
+    // blk contains the following:
+    // uint16_t ql[QUANT_K_Q6_K/2/2];
+    // uint16_t qh[QUANT_K_Q6_K/4/2];
+    // uint16_t scales[QUANT_K_Q6_K/8];
+    uint16_t blk[104];
     float16_t d;
 };
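The repacked struct folds the signed 8-bit scales into the trailing uint16 words of `blk` (words 96..103), two scales per word, so the whole block can be copied through one uniform 16-bit array. The sccache fill recovers them with `bitfieldExtract` at bit offset 0 or 8 (`bcs_offset`) followed by an `int8_t` cast to restore the sign. A hedged standalone illustration of that extraction (the helper name and sample value are invented):

// extract_scale(uint16_t(0x05FBu), 0) == int8_t(-5)  (low byte 0xFB)
// extract_scale(uint16_t(0x05FBu), 1) == int8_t(5)   (high byte 0x05)
int8_t extract_scale(uint16_t word, uint byte_idx) {
    // pull 8 bits starting at bit 0 or bit 8; the int8_t cast
    // reinterprets them as a signed value
    return int8_t(bitfieldExtract(uint(word), int(byte_idx * 8), 8));
}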
