From bd807499f7cd08deaf5c7073ca5ef72279bbb1ad Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 19 Jun 2024 12:21:08 -0400 Subject: [PATCH 01/28] ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b --- convert-hf-to-gguf.py | 25 ++- examples/quantize/quantize.cpp | 2 + ggml/include/ggml.h | 2 + ggml/src/ggml-common.h | 117 +++++++++++ ggml/src/ggml-quants.c | 353 ++++++++++++++++++++++++++++++++- ggml/src/ggml-quants.h | 10 + ggml/src/ggml.c | 27 ++- gguf-py/gguf/constants.py | 6 + gguf-py/gguf/quants.py | 50 +++++ include/llama.h | 2 + src/llama.cpp | 4 + 11 files changed, 594 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index c26fad9307f15..85ec3620c80e8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -296,12 +296,27 @@ def write_tensors(self): )) if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any( + self.match_model_tensor_name(new_name, key, None) + for key in [ + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + ] + ): + data = gguf.quantize_q1_3(data) + assert data.dtype == np.uint8 + data_qtype = gguf.GGMLQuantizationType.Q1_3 + + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: data = gguf.quantize_bf16(data) assert data.dtype == np.int16 data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): + elif ( + self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 + or self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 + and gguf.can_quantize_to_q8_0(data) + ): data = gguf.quantize_q8_0(data) assert data.dtype == np.uint8 data_qtype = gguf.GGMLQuantizationType.Q8_0 @@ -1412,6 +1427,12 @@ def write_tensors(self): class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, *args, **kwargs): + if ftype == gguf.LlamaFileType.GUESSED: + ftype = gguf.LlamaFileType.MOSTLY_Q1_3 + + super().__init__(dir_model, ftype, *args, **kwargs) + def set_vocab(self): self._set_vocab_sentencepiece() diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 76e2052d55d79..43241df6087c7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -26,6 +26,8 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, + { "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet 1.58b", }, + { "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet 1.58b", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d895c9acdb596..951494314f4e2 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -383,6 +383,8 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, + GGML_TYPE_Q2_2 = 31, + GGML_TYPE_Q1_3 = 32, GGML_TYPE_COUNT, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index e8efceb760d40..fd5d8a90a874b 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -137,6 +137,20 @@ typedef sycl::half2 ggml_half2; #endif // GGML_COMMON_DECL_CUDA || 
GGML_COMMON_DECL_HIP +// 1.625 bpw for BitNet 1.58b models +#define QK1_3 64 +typedef struct { + uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256) + uint8_t qs[QK1_3/64]; // 4 elements per byte +} block_q1_3; +static_assert(sizeof(block_q1_3) == (QK1_3 - 4*QK1_3/64)/5 + QK1_3/64, "wrong q1_3 block size/padding"); + +#define QK2_2 32 +typedef struct { + uint8_t qs[QK2_2 / 4]; // nibbles / quants +} block_q2_2; +static_assert(sizeof(block_q2_2) == QK2_2 / 4, "wrong q2_2 block size/padding"); + #define QK4_0 32 typedef struct { ggml_half d; // delta @@ -333,6 +347,7 @@ typedef struct { } block_iq3_s; static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); +// 1.5625 bpw typedef struct { ggml_half d; uint8_t qs[QK_K/8]; @@ -1022,6 +1037,108 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512) 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, GGML_TABLE_END() +GGML_TABLE_BEGIN(uint32_t, q22_grid, 256) + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00010000, 0x01010000, 0x00010000, 0xff010000, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, + 0x00000100, 0x01000100, 0x00000100, 0xff000100, + 0x00010100, 0x01010100, 0x00010100, 0xff010100, + 0x00000100, 0x01000100, 0x00000100, 0xff000100, + 0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00010000, 0x01010000, 0x00010000, 0xff010000, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, + 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, + 0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00, + 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, + 0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00, + 0x00000001, 0x01000001, 0x00000001, 0xff000001, + 0x00010001, 0x01010001, 0x00010001, 0xff010001, + 0x00000001, 0x01000001, 0x00000001, 0xff000001, + 0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001, + 0x00000101, 0x01000101, 0x00000101, 0xff000101, + 0x00010101, 0x01010101, 0x00010101, 0xff010101, + 0x00000101, 0x01000101, 0x00000101, 0xff000101, + 0x00ff0101, 0x01ff0101, 0x00ff0101, 0xffff0101, + 0x00000001, 0x01000001, 0x00000001, 0xff000001, + 0x00010001, 0x01010001, 0x00010001, 0xff010001, + 0x00000001, 0x01000001, 0x00000001, 0xff000001, + 0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001, + 0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01, + 0x0001ff01, 0x0101ff01, 0x0001ff01, 0xff01ff01, + 0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01, + 0x00ffff01, 0x01ffff01, 0x00ffff01, 0xffffff01, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00010000, 0x01010000, 0x00010000, 0xff010000, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, + 0x00000100, 0x01000100, 0x00000100, 0xff000100, + 0x00010100, 0x01010100, 0x00010100, 0xff010100, + 0x00000100, 0x01000100, 0x00000100, 0xff000100, + 0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00010000, 0x01010000, 0x00010000, 0xff010000, + 0x00000000, 0x01000000, 0x00000000, 0xff000000, + 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, + 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, + 0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00, + 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, + 0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00, + 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, + 0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff, + 0x000000ff, 
0x010000ff, 0x000000ff, 0xff0000ff, + 0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff, + 0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff, + 0x000101ff, 0x010101ff, 0x000101ff, 0xff0101ff, + 0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff, + 0x00ff01ff, 0x01ff01ff, 0x00ff01ff, 0xffff01ff, + 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, + 0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff, + 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, + 0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff, + 0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff, + 0x0001ffff, 0x0101ffff, 0x0001ffff, 0xff01ffff, + 0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff, + 0x00ffffff, 0x01ffffff, 0x00ffffff, 0xffffffff, +GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256) + 0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, + 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001, + 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, + 0xff010000, 0xff010001, 0xff0101ff, 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, + 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, + 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, + 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, + 0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, + 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, + 0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x0101ff01, + 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, + 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, + 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, + 0xff000101, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, + 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0000, + 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, + 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, + 0x000100ff, 0x00010000, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff, + 0x01ffff00, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, + 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x01000001, 0x010001ff, + 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, + 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, + 0xffff0001, 0xffff01ff, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, + 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ff00, + 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, 0xff0101ff, 0xff010100, 0xff010101, + 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, + 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, + 0x00000100, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, + 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 
0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ff00ff, + 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x01ff0101, 0x0100ffff, 0x0100ff00, + 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, + 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, +GGML_TABLE_END() + #define NGRID_IQ1S 2048 #define IQ1S_DELTA 0.125f #define IQ1M_DELTA 0.125f diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 0eb52e485089f..349db94c7a82d 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1630,7 +1630,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int6 // ===================== Helper functions // static inline int nearest_int(float fval) { - assert(fval <= 4194303.f); + assert(fabsf(fval) <= 4194303.f); float val = fval + 12582912.f; int i; memcpy(&i, &val, sizeof(int)); return (i & 0x007fffff) - 0x00400000; @@ -3306,6 +3306,140 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } +size_t quantize_q2_2(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_Q2_2, n_per_row); + quantize_row_q2_2_reference(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + +// ====================== 1.625 bpw (de)-quantization (BitNet 1.58b) + +void quantize_row_q1_3_reference(const float * restrict x, block_q1_3 * restrict y, int64_t k) { + assert(k % QK1_3 == 0); + const int64_t nb = k / QK1_3; + static_assert(sizeof(y->q) % 4 == 0, "bad block_q1_3.q size"); + + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + for (int64_t i = 0; i < nb; ++i) { + uint8_t q[sizeof(y->q)] = {0}; + for (size_t j = 0; j < sizeof(y->q); ++j) { + for (size_t m = 0; m < 4; ++m) { + int xi = nearest_int(x[m]); + uint8_t xt = xi < 0 ? 0 : xi == 0 ? 1 : 2; + q[j] += xt * pow3[m]; + } + x += 4; + } + for (size_t j = 0; j < sizeof(y->q); ++j) { + int xi = nearest_int(x[j]); + uint8_t xt = xi < 0 ? 0 : xi == 0 ? 1 : 2; + q[j] += xt * pow3[4]; + q[j] = ((uint16_t)q[j] * 256) / pow3[5]; + q[j] += (uint8_t)(q[j] != 0); + y[i].q[j] = q[j]; + } + x += sizeof(y->q); + + for (size_t j = 0; j < sizeof(y->qs); ++j) { + uint8_t qb = 0; + for (size_t m = 0; m < 4; ++m) { + int xi = nearest_int(x[m]); + uint8_t xt = xi < 0 ? 0 : xi == 0 ? 
1 : 2; + qb += xt * pow3[m]; + } + x += 4; + qb = ((uint16_t)qb * 256) / pow3[5]; + qb += (uint8_t)(qb != 0); + y[i].qs[j] = qb; + } + } +} + +void quantize_row_q1_3(const float * restrict x, void * restrict vy, int64_t k) { + assert(k % QK1_3 == 0); + block_q1_3 * restrict y = vy; + quantize_row_q1_3_reference(x, y, k); +} + +size_t quantize_q1_3(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_Q1_3, n_per_row); + quantize_row_q1_3(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + +void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int64_t k) { + assert(k % QK1_3 == 0); + const int64_t nb = k / QK1_3; + static_assert(sizeof(x->q) % 4 == 0, "bad block_q1_3.q size"); + +// #if defined(__SSE2__) +// __m128 vscale = _mm_set1_ps(scale); + +// for (int64_t i = 0; i < nb; ++i) { +// for (size_t j = 0; j < sizeof(x->q); j += 4) { +// __m128 q1 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 0]])); +// __m128 q2 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 1]])); +// __m128 q3 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 2]])); +// __m128 q4 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 3]])); +// q1 = _mm_mul_ps(q1, vscale); +// q2 = _mm_mul_ps(q2, vscale); +// q3 = _mm_mul_ps(q3, vscale); +// q4 = _mm_mul_ps(q4, vscale); + +// _mm_store_ps(y + 0, q1); +// _mm_store_ps(y + 4, q2); +// _mm_store_ps(y + 8, q3); +// _mm_store_ps(y + 12, q4); +// y += 16; +// } + +// for (size_t j = 0; j < sizeof(x->q); j += 4) { +// __m128i q5i = _mm_loadu_si32(x[i].q + j); +// q5i = _mm_cvtepi8_epi16(q5i); +// q5i = _mm_add_epi16(q5i, _mm_add_epi16(q5i, q5i)); +// q5i = _mm_srli_epi16(q5i, 8); +// q5i = _mm_sub_epi16(q5i, _mm_set1_epi16(1)); +// __m128 q5 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(q5i)); +// q5 = _mm_mul_ps(q5, vscale); + +// _mm_store_ps(y, q5); +// y += 4; +// } + +// for (size_t j = 0; j < sizeof(x->qs); ++j) { +// __m128 q = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].qs[j]])); +// q = _mm_mul_ps(q, vscale); +// _mm_store_ps(y, q); +// y += 4; +// } +// } +// #else + for (int64_t i = 0; i < nb; ++i) { + for (size_t j = 0; j < sizeof(x->q); ++j) { + const int8_t * q = (const int8_t *) (q1_3_grid + x[i].q[j]); + for (int m = 0; m < 4; ++m) { + *y++ = (float) q[m]; + } + } + + for (size_t j = 0; j < sizeof(x->q); ++j) { + uint16_t q = x[i].q[j]; + *y++ = (float) ((int16_t)((q * 3) >> 8) - 1); + } + + for (size_t j = 0; j < sizeof(x->qs); ++j) { + const int8_t * q = (const int8_t *) (q1_3_grid + x[i].qs[j]); + for (int m = 0; m < 4; ++m) { + *y++ = (float) q[m]; + } + } + } +// #endif +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) { @@ -3726,6 +3860,122 @@ static inline __m128i get_scale_shuffle(int i) { } #endif +void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_2 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__AVX2__) + __m256 acc = _mm256_setzero_ps(); + + int leftovers = nb % 2; + + for (int i = 0; i < nb - leftovers; i += 2) { + + const __m256 d0 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 0].d) ); + const __m256 d1 = 
_mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 1].d) ); + + // assuming two consecutive blocks are contiguous AND aligned + __m128i xq16b = _mm_load_si128((const __m128i *) (x[i].qs)); + __m256i xq16 = MM256_SET_M128I(xq16b, xq16b); + __m256i xq8l0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, + 4, -1, 4, -1, 4, -1, 4, -1, + 1, -1, 1, -1, 1, -1, 1, -1, + 0, -1, 0, -1, 0, -1, 0, -1)); + __m256i xq8h0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, + 6, -1, 6, -1, 6, -1, 6, -1, + 3, -1, 3, -1, 3, -1, 3, -1, + 2, -1, 2, -1, 2, -1, 2, -1)); + __m256i xq8l1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(13, -1, 13, -1, 13, -1, 13, -1, + 12, -1, 12, -1, 12, -1, 12, -1, + 9, -1, 9, -1, 9, -1, 9, -1, + 8, -1, 8, -1, 8, -1, 8, -1)); + __m256i xq8h1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(15, -1, 15, -1, 15, -1, 15, -1, + 14, -1, 14, -1, 14, -1, 14, -1, + 11, -1, 11, -1, 11, -1, 11, -1, + 10, -1, 10, -1, 10, -1, 10, -1)); + __m256i shift = _mm256_set_epi16(64, 16, 4, 1, + 64, 16, 4, 1, + 64, 16, 4, 1, + 64, 16, 4, 1); + xq8l0 = _mm256_mullo_epi16(xq8l0, shift); + xq8h0 = _mm256_mullo_epi16(xq8h0, shift); + xq8l1 = _mm256_mullo_epi16(xq8l1, shift); + xq8h1 = _mm256_mullo_epi16(xq8h1, shift); + xq8l0 = _mm256_srai_epi16(xq8l0, 14); + xq8h0 = _mm256_srai_epi16(xq8h0, 14); + xq8l1 = _mm256_srai_epi16(xq8l1, 14); + xq8h1 = _mm256_srai_epi16(xq8h1, 14); + __m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0); + __m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1); + + __m256i yq8_0 = _mm256_lddqu_si256((const __m256i *) (y[i + 0].qs)); + __m256i yq8_1 = _mm256_lddqu_si256((const __m256i *) (y[i + 1].qs)); + + const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0); + const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1); + + acc = _mm256_fmadd_ps( d0, q0, acc ); + acc = _mm256_fmadd_ps( d1, q1, acc ); + } + + for (int i = nb - leftovers; i < nb; ++i) { + + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i].d) ); + + __m128i xq8b = _mm_loadu_si64(x[i].qs); + __m256i xq8 = MM256_SET_M128I(xq8b, xq8b); + __m256i xq8l = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, + 4, -1, 4, -1, 4, -1, 4, -1, + 1, -1, 1, -1, 1, -1, 1, -1, + 0, -1, 0, -1, 0, -1, 0, -1)); + __m256i xq8h = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, + 6, -1, 6, -1, 6, -1, 6, -1, + 3, -1, 3, -1, 3, -1, 3, -1, + 2, -1, 2, -1, 2, -1, 2, -1)); + __m256i shift = _mm256_set_epi16(64, 16, 4, 1, + 64, 16, 4, 1, + 64, 16, 4, 1, + 64, 16, 4, 1); + xq8l = _mm256_mullo_epi16(xq8l, shift); + xq8h = _mm256_mullo_epi16(xq8h, shift); + xq8l = _mm256_srai_epi16(xq8l, 14); + xq8h = _mm256_srai_epi16(xq8h, 14); + xq8 = _mm256_packs_epi16(xq8l, xq8h); + + __m256i yq8 = _mm256_lddqu_si256((const __m256i *) (y[i].qs)); + const __m256 q = mul_sum_i8_pairs_float(xq8, yq8); + + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#else + + float sumf = 0.0; + for (int i = 0; i < nb; i++) { + int sumi = 0; + for (int j = 0; j < qk / 4; j++) { + const int8_t* weight = (const int8_t *)(q22_grid + x[i].qs[j]); + sumi += (int)y[i].qs[4*j+0] * weight[0]; + sumi += (int)y[i].qs[4*j+1] * weight[1]; + sumi += (int)y[i].qs[4*j+2] * weight[2]; + sumi += (int)y[i].qs[4*j+3] * weight[3]; + } + sumf += (float)(sumi)*(GGML_FP16_TO_FP32(y[i].d)); + } + *s = sumf; +#endif +} + void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -11102,6 
+11352,105 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #endif +void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + // assumed by the code below + assert(n % QK1_3 == 0); + static_assert(QK1_3 == 2 * QK8_0, "QK1_3 must be 2 times bigger than QK8_0"); + + const block_q1_3 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + const int nb = n / QK1_3; + +#if defined(__AVX2__) + __m256 accumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + { + __m256i x0 = _mm256_set_epi32(q1_3_grid[x[i].q[7]], q1_3_grid[x[i].q[6]], + q1_3_grid[x[i].q[5]], q1_3_grid[x[i].q[4]], + q1_3_grid[x[i].q[3]], q1_3_grid[x[i].q[2]], + q1_3_grid[x[i].q[1]], q1_3_grid[x[i].q[0]]); + __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i].qs)); + + __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d)); + + __m256 q = mul_sum_i8_pairs_float(x0, y0); + + accumf = _mm256_fmadd_ps(d, q, accumf); + } + + { + __m256i x1 = _mm256_castsi128_si256(_mm_set_epi32(q1_3_grid[x[i].q[11]], q1_3_grid[x[i].q[10]], + q1_3_grid[x[i].q[9]], q1_3_grid[x[i].q[8]])); + __m256i x2 = _mm256_cvtepu8_epi16(_mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1))); + __m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs)); + + x2 = _mm256_mulhi_epu16(x2, _mm256_set1_epi16(3 << 8)); + x2 = _mm256_sub_epi16(x2, _mm256_set1_epi16(1)); + + // TODO: reduce shuffling + x2 = _mm256_packs_epi16(x2, _mm256_setzero_si256()); + x2 = _mm256_permute4x64_epi64(x2, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i x2_l = _mm_insert_epi32(_mm256_castsi256_si128(x2), q1_3_grid[x[i].qs[0]], 3); + x1 = _mm256_inserti128_si256(x1, x2_l, 1); + + __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d)); + + __m256 q = mul_sum_i8_pairs_float(x1, y1); + + accumf = _mm256_fmadd_ps(d, q, accumf); + } + } + + *s = hsum_float_8(accumf); +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + for (int j = 0; j < 8; ++j) { + const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].q[j]); + for (int k = 0; k < 4; ++k) { + sum += xj[k] * (int16_t) y[2*i].qs[4*j + k]; + } + } + + sumf += GGML_FP16_TO_FP32(y[2*i].d) * sum; + sum = 0; + + for (int j = 0; j < 4; ++j) { + const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].q[8 + j]); + for (int k = 0; k < 4; ++k) { + sum += xj[k] * (int16_t) y[2*i + 1].qs[4*j + k]; + } + } + + for (size_t j = 0; j < 12; ++j) { + uint16_t xj = x[i].q[j]; + xj = (xj * 3) >> 8; + sum += ((int16_t) xj - 1) * (int16_t) y[2*i + 1].qs[16 + j]; + } + + { + const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].qs[0]); + for (int k = 0; k < 4; ++k) { + sum += (int16_t) xj[k] * (int16_t) y[2*i + 1].qs[28 + k]; + } + } + + sumf += GGML_FP16_TO_FP32(y[2*i + 1].d) * sum; + } + + *s = sumf; +#endif +} + void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -14977,6 +15326,8 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; + case GGML_TYPE_Q1_3: + case GGML_TYPE_Q2_2: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 4d436a8f06b3e..fe28132ad82bf 100644 --- a/ggml/src/ggml-quants.h 
+++ b/ggml/src/ggml-quants.h @@ -12,6 +12,8 @@ extern "C" { #endif // Quantization +void quantize_row_q1_3_reference(const float * GGML_RESTRICT x, block_q1_3 * GGML_RESTRICT y, int64_t k); +void quantize_row_q2_2_reference(const float * GGML_RESTRICT x, block_q2_2 * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); @@ -32,6 +34,8 @@ void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); +void quantize_row_q1_3(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q2_2(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -53,6 +57,8 @@ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dequantization +void dequantize_row_q1_3(const block_q1_3 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q2_2(const block_q2_2 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -78,6 +84,8 @@ void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); // Dot product +void ggml_vec_dot_q1_3_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_2_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -116,6 +124,8 @@ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q6_K(const float * GGML_RESTRICT src, 
void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q1_3(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q2_2(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f5502afbe98b3..8c444d0b6f67f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -823,6 +823,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q1_3] = { + .type_name = "q1_3", + .blck_size = QK1_3, + .type_size = sizeof(block_q1_3), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q1_3, + .from_float = quantize_row_q1_3, + .from_float_reference = (ggml_from_float_t) quantize_row_q1_3_reference, + .vec_dot = ggml_vec_dot_q1_3_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", .blck_size = QK_K, @@ -10041,7 +10053,16 @@ static void ggml_compute_forward_mul_f32( GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - if (nb10 == sizeof(float)) { + if (ggml_nelements(src1) == 1) { + float scale = ((float *) src1->data)[0]; + for (int64_t ir = ith; ir < nr; ir += nth) { + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + ir*nb1, (char *)src0->data + ir*nb01, ne0 * sizeof(float)); + } + ggml_vec_scale_f32(ne0, (float *) ((char *) dst->data + ir*nb1), scale); + } + } else if (nb10 == sizeof(float)) { for (int64_t ir = ith; ir < nr; ir += nth) { // src0 and dst are same shape => same indices const int64_t i03 = ir/(ne02*ne01); @@ -13713,6 +13734,8 @@ static void ggml_compute_forward_clamp( } break; case GGML_TYPE_F16: case GGML_TYPE_BF16: + case GGML_TYPE_Q1_3: + case GGML_TYPE_Q2_2: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -20438,6 +20461,8 @@ size_t ggml_quantize_chunk( size_t result = 0; switch (type) { + case GGML_TYPE_Q1_3: result = quantize_q1_3(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_2: result = quantize_q2_2(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 222a2d137b08f..bf51a81a900f1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1023,6 +1023,8 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 + Q2_2 = 31 + Q1_3 = 32 # TODO: add GGMLFileType from ggml_ftype in ggml.h @@ -1064,6 +1066,8 @@ class LlamaFileType(IntEnum): 
MOSTLY_IQ4_XS = 30 # except 1d tensors MOSTLY_IQ1_M = 31 # except 1d tensors MOSTLY_BF16 = 32 # except 1d tensors + MOSTLY_Q2_2 = 33 # except 1d tensors + MOSTLY_Q1_3 = 34 # except 1d tensors GUESSED = 1024 # not specified in the model file @@ -1137,6 +1141,8 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), + GGMLQuantizationType.Q2_2: (32, 8), + GGMLQuantizationType.Q1_3: (64, 12 + 1), } diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index b22eec1661ce7..a2beb0d53375a 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -121,3 +121,53 @@ def quantize_q8_0(data: np.ndarray): return __quantize_q8_0_lazy(data) else: return __quantize_q8_0_array(data) + + +__q1_3_block_size, __q1_3_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3] + + +def __quantize_q1_3_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: + return (*s[:-1], s[-1] // __q1_3_block_size * __q1_3_type_size) + + +def __quantize_q1_3_rows(n: np.ndarray) -> np.ndarray: + shape = n.shape + assert shape[-1] % __q1_3_block_size == 0 + + n_blocks = n.size // __q1_3_block_size + + blocks = n.reshape((n_blocks, __q1_3_block_size)).astype(np.float32, copy=False) + + # assuming the weights are pre-scaled + blocks = (np.sign(blocks).astype(np.int8) + 1).view(np.uint8) + q48, rest = np.hsplit(blocks, (48,)) + q12, q4 = np.hsplit(rest, (12,)) + + pow3 = np.array([1, 3, 9, 27]) + q48 = q48.reshape((n_blocks, 12, 4)) + q48 = np.sum(q48 * pow3.reshape((1, 1, 4)), axis=2, keepdims=True).reshape((n_blocks, 12)) + q4 = np.sum(q4 * pow3.reshape((1, 4)), axis=1, keepdims=True) + q48 = q48 + (q12 * 81) + q = np.concatenate([q48, q4], axis=1); + q = ((q.astype(np.uint16) * 256) // 243).astype(np.uint8) + q = np.where(q != 0, q + 1, 0); + + return q.reshape(__quantize_q1_3_shape_change(shape)) + + +def __quantize_q1_3_array(n: np.ndarray) -> np.ndarray: + return __apply_over_grouped_rows(__quantize_q1_3_rows, arr=n, otype=np.uint8, oshape=__quantize_q1_3_shape_change(n.shape)) + + +__quantize_q1_3_lazy = LazyNumpyTensor._wrap_fn( + __quantize_q1_3_array, + meta_noop=(np.uint8, __quantize_q1_3_shape_change), +) + + +def quantize_q1_3(data: np.ndarray): + if type(data) is LazyNumpyTensor: + return __quantize_q1_3_lazy(data) + else: + return __quantize_q1_3_array(data) + diff --git a/include/llama.h b/include/llama.h index 88eecb0edb17e..492c7ec6b54bc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -158,6 +158,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_2 = 33, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q1_3 = 34, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index f78594a6f7c49..fa2d97e65d472 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4186,6 +4186,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; + case LLAMA_FTYPE_MOSTLY_Q1_3: return "Q1_3 - 1.625 bpw for BitNet 1.58b"; + case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2 - 2.000 bpw for BitNet 1.58b"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: @@ -16287,6 +16289,8 
@@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llama_ftype ftype = params->ftype; switch (params->ftype) { + case LLAMA_FTYPE_MOSTLY_Q1_3: default_type = GGML_TYPE_Q1_3; break; + case LLAMA_FTYPE_MOSTLY_Q2_2: default_type = GGML_TYPE_Q2_2; break; case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; From 7ef4254a92628077bcfe03f0d68a58e5e68d1b4b Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 19 Jun 2024 14:34:32 -0400 Subject: [PATCH 02/28] ggml-quants : faster 1.625 bpw AVX2 vec_dot Not using a lookup table anymore makes it match q4_0 speed. * gguf-py : fix formatting * llama : remove spaces on empty line --- ggml/src/ggml-quants.c | 84 ++++++++++++++++++++++++++++-------------- gguf-py/gguf/quants.py | 5 +-- 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 349db94c7a82d..c20afaf3a2edd 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -11371,40 +11371,68 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - { - __m256i x0 = _mm256_set_epi32(q1_3_grid[x[i].q[7]], q1_3_grid[x[i].q[6]], - q1_3_grid[x[i].q[5]], q1_3_grid[x[i].q[4]], - q1_3_grid[x[i].q[3]], q1_3_grid[x[i].q[2]], - q1_3_grid[x[i].q[1]], q1_3_grid[x[i].q[0]]); - __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i].qs)); - - __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d)); - - __m256 q = mul_sum_i8_pairs_float(x0, y0); - - accumf = _mm256_fmadd_ps(d, q, accumf); - } + // __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1)); + // __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12); + // WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though. 
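+        // Layout reminder (derived from quantize_row_q1_3_reference): each of the 13 bytes in a
+        // block packs up to 5 ternary digits in base 3, rescaled into the byte so that
+        // q ~= (t0 + 3*t1 + 9*t2 + 27*t3 + 81*t4) * 256/243 (with a +1 nudge for non-zero values).
+        // Digit k can then be recovered as (((q * 3^(4-k)) & 0xFF) * 3) >> 8, which is what the
+        // shuffle/mullo/mulhi sequence below computes for all digits at once: the shuffle
+        // pre-places each byte in the high half of a 16-bit lane, the mullo by {81,27,9,3,1}
+        // wraps mod 2^16 to discard the higher digits, and the mulhi by 3 reads out one digit
+        // in {0, 1, 2} before it is mapped to {-1, 0, 1}.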
+ __m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q); + __m256i x12 = MM256_SET_M128I(x12b, x12b); { - __m256i x1 = _mm256_castsi128_si256(_mm_set_epi32(q1_3_grid[x[i].q[11]], q1_3_grid[x[i].q[10]], - q1_3_grid[x[i].q[9]], q1_3_grid[x[i].q[8]])); - __m256i x2 = _mm256_cvtepu8_epi16(_mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1))); + __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, + 4, -1, 4, -1, 4, -1, 4, -1, + 1, -1, 1, -1, 1, -1, 1, -1, + 0, -1, 0, -1, 0, -1, 0, -1)); + __m256i x0h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, + 6, -1, 6, -1, 6, -1, 6, -1, + 3, -1, 3, -1, 3, -1, 3, -1, + 2, -1, 2, -1, 2, -1, 2, -1)); + __m256i x1l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 6, -1, 5, -1, 4, -1, + 3, -1, 2, -1, 1, -1, 0, -1, + 9, -1, 9, -1, 9, -1, 9, -1, + 8, -1, 8, -1, 8, -1, 8, -1)); + __m256i x1h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(12, -1, 12, -1, 12, -1, 12, -1, + 11, -1, 10, -1, 9, -1, 8, -1, + 11, -1, 11, -1, 11, -1, 11, -1, + 10, -1, 10, -1, 10, -1, 10, -1)); + const __m256i shift0 = _mm256_set_epi16(3, 9, 27, 81, + 3, 9, 27, 81, + 3, 9, 27, 81, + 3, 9, 27, 81); + const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1, + 1, 1, 1, 1, + 3, 9, 27, 81, + 3, 9, 27, 81); + const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81, + 1, 1, 1, 1, + 3, 9, 27, 81, + 3, 9, 27, 81); + x0l = _mm256_mullo_epi16(x0l, shift0); + x0h = _mm256_mullo_epi16(x0h, shift0); + x1l = _mm256_mullo_epi16(x1l, shift1l); + x1h = _mm256_mullo_epi16(x1h, shift1h); + x0l = _mm256_mulhi_epu16(x0l, _mm256_set1_epi16(3)); + x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3)); + x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3)); + x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3)); + x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1)); + x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1)); + x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1)); + x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1)); + + __m256i x0 = _mm256_packs_epi16(x0l, x0h); + __m256i x1 = _mm256_packs_epi16(x1l, x1h); + + __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs)); __m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs)); - x2 = _mm256_mulhi_epu16(x2, _mm256_set1_epi16(3 << 8)); - x2 = _mm256_sub_epi16(x2, _mm256_set1_epi16(1)); - - // TODO: reduce shuffling - x2 = _mm256_packs_epi16(x2, _mm256_setzero_si256()); - x2 = _mm256_permute4x64_epi64(x2, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i x2_l = _mm_insert_epi32(_mm256_castsi256_si128(x2), q1_3_grid[x[i].qs[0]], 3); - x1 = _mm256_inserti128_si256(x1, x2_l, 1); - - __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d)); + __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d)); + __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d)); - __m256 q = mul_sum_i8_pairs_float(x1, y1); + __m256 q0 = mul_sum_i8_pairs_float(x0, y0); + __m256 q1 = mul_sum_i8_pairs_float(x1, y1); - accumf = _mm256_fmadd_ps(d, q, accumf); + accumf = _mm256_fmadd_ps(d0, q0, accumf); + accumf = _mm256_fmadd_ps(d1, q1, accumf); } } diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index a2beb0d53375a..46820dce3b288 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -148,9 +148,9 @@ def __quantize_q1_3_rows(n: np.ndarray) -> np.ndarray: q48 = np.sum(q48 * pow3.reshape((1, 1, 4)), axis=2, keepdims=True).reshape((n_blocks, 12)) q4 = np.sum(q4 * pow3.reshape((1, 4)), axis=1, keepdims=True) q48 = q48 + (q12 * 81) - q = np.concatenate([q48, q4], axis=1); + q = 
np.concatenate([q48, q4], axis=1) q = ((q.astype(np.uint16) * 256) // 243).astype(np.uint8) - q = np.where(q != 0, q + 1, 0); + q = np.where(q != 0, q + 1, 0) return q.reshape(__quantize_q1_3_shape_change(shape)) @@ -170,4 +170,3 @@ def quantize_q1_3(data: np.ndarray): return __quantize_q1_3_lazy(data) else: return __quantize_q1_3_array(data) - From 48b73b849880ff43f0dd818252cf00cea5a83061 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 19 Jun 2024 17:50:34 -0400 Subject: [PATCH 03/28] ggml-quants : substract 1 when back in epi8 This makes the 1.625 bpw type go faster than q4_0. Still not the fastest. --- ggml/src/ggml-quants.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index c20afaf3a2edd..050197545302f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3917,8 +3917,8 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r __m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0); __m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1); - __m256i yq8_0 = _mm256_lddqu_si256((const __m256i *) (y[i + 0].qs)); - __m256i yq8_1 = _mm256_lddqu_si256((const __m256i *) (y[i + 1].qs)); + __m256i yq8_0 = _mm256_loadu_si256((const __m256i *) (y[i + 0].qs)); + __m256i yq8_1 = _mm256_loadu_si256((const __m256i *) (y[i + 1].qs)); const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0); const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1); @@ -3951,7 +3951,7 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r xq8h = _mm256_srai_epi16(xq8h, 14); xq8 = _mm256_packs_epi16(xq8l, xq8h); - __m256i yq8 = _mm256_lddqu_si256((const __m256i *) (y[i].qs)); + __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs)); const __m256 q = mul_sum_i8_pairs_float(xq8, yq8); acc = _mm256_fmadd_ps( d, q, acc ); @@ -11371,11 +11371,12 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - // __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1)); - // __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12); - // WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though. - __m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q); - __m256i x12 = MM256_SET_M128I(x12b, x12b); + // const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1)); + // const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12); + // WARNING: reading 3 bytes further than necessary. 
+ // It's measurably faster than a masked load on an Intel Core m3-8100Y + const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q)); + const __m256i x12 = MM256_SET_M128I(x12b, x12b); { __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, @@ -11406,6 +11407,7 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r 1, 1, 1, 1, 3, 9, 27, 81, 3, 9, 27, 81); + // extract ternary values x0l = _mm256_mullo_epi16(x0l, shift0); x0h = _mm256_mullo_epi16(x0h, shift0); x1l = _mm256_mullo_epi16(x1l, shift1l); @@ -11414,22 +11416,22 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3)); x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3)); x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3)); - x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1)); - x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1)); - x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1)); - x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1)); __m256i x0 = _mm256_packs_epi16(x0l, x0h); __m256i x1 = _mm256_packs_epi16(x1l, x1h); - __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs)); - __m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs)); + // 0, 1, 2 => -1, 0, 1 + x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1)); + x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1)); - __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d)); - __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d)); + const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 0].qs)); + const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 1].qs)); - __m256 q0 = mul_sum_i8_pairs_float(x0, y0); - __m256 q1 = mul_sum_i8_pairs_float(x1, y1); + const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d)); + const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d)); + + const __m256 q0 = mul_sum_i8_pairs_float(x0, y0); + const __m256 q1 = mul_sum_i8_pairs_float(x1, y1); accumf = _mm256_fmadd_ps(d0, q0, accumf); accumf = _mm256_fmadd_ps(d1, q1, accumf); From ef1e345c85561b63ba9c791b38e76d3f0df5f2bb Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 19 Jun 2024 22:12:43 -0400 Subject: [PATCH 04/28] ggml-quants : Q2_2 now faster than Q4_K on with AVX2 --- convert-hf-to-gguf.py | 2 + ggml/src/ggml-common.h | 67 -------------------- ggml/src/ggml-quants.c | 141 ++++++++++++++++++----------------------- 3 files changed, 65 insertions(+), 145 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 85ec3620c80e8..a8aef09b93369 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -296,6 +296,8 @@ def write_tensors(self): )) if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: + # TODO: cleaner model-specific per-tensor types + # NOTE: Q1_3 is only relevant for BitNet 1.58b if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any( self.match_model_tensor_name(new_name, key, None) for key in [ diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index fd5d8a90a874b..9c680e3b1c05f 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -1037,73 +1037,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512) 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, GGML_TABLE_END() -GGML_TABLE_BEGIN(uint32_t, q22_grid, 256) - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 
0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00010100, 0x01010100, 0x00010100, 0xff010100, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00010001, 0x01010001, 0x00010001, 0xff010001, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001, - 0x00000101, 0x01000101, 0x00000101, 0xff000101, - 0x00010101, 0x01010101, 0x00010101, 0xff010101, - 0x00000101, 0x01000101, 0x00000101, 0xff000101, - 0x00ff0101, 0x01ff0101, 0x00ff0101, 0xffff0101, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00010001, 0x01010001, 0x00010001, 0xff010001, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001, - 0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01, - 0x0001ff01, 0x0101ff01, 0x0001ff01, 0xff01ff01, - 0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01, - 0x00ffff01, 0x01ffff01, 0x00ffff01, 0xffffff01, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00010100, 0x01010100, 0x00010100, 0xff010100, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff, - 0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff, - 0x000101ff, 0x010101ff, 0x000101ff, 0xff0101ff, - 0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff, - 0x00ff01ff, 0x01ff01ff, 0x00ff01ff, 0xffff01ff, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff, - 0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff, - 0x0001ffff, 0x0101ffff, 0x0001ffff, 0xff01ffff, - 0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff, - 0x00ffffff, 0x01ffffff, 0x00ffffff, 0xffffffff, -GGML_TABLE_END() - GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256) 0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001, diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 050197545302f..4d7c6ff61ea52 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -657,6 +657,35 @@ static inline __m128i packNibbles( __m256i bytes ) { } 
#endif //__loongarch_asx +void quantize_row_q2_2_reference(const float * restrict x, block_q2_2 * restrict y, int64_t k) { + static const int qk = QK2_2; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + + for (int j = 0; j < qk/4; ++j) { + int8_t x0 = (int8_t)x[i*qk + 0 + j]; + int8_t x1 = (int8_t)x[i*qk + 1*qk/4 + j]; + int8_t x2 = (int8_t)x[i*qk + 2*qk/4 + j]; + int8_t x3 = (int8_t)x[i*qk + 3*qk/4 + j]; + + const uint8_t xi0 = x0 < 0 ? 1 : x0 == 0 ? 2 : 3; + const uint8_t xi1 = x1 < 0 ? 1 : x1 == 0 ? 2 : 3; + const uint8_t xi2 = x2 < 0 ? 1 : x2 == 0 ? 2 : 3; + const uint8_t xi3 = x3 < 0 ? 1 : x3 == 0 ? 2 : 3; + + y[i].qs[j] = 0; + y[i].qs[j] |= (xi0 << 0); + y[i].qs[j] |= (xi1 << 2); + y[i].qs[j] |= (xi2 << 4); + y[i].qs[j] |= (xi3 << 6); + } + } +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) { static const int qk = QK4_0; @@ -1512,6 +1541,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) #endif } +void dequantize_row_q2_2(const block_q2_2 * restrict x, float * restrict y, int64_t k) { + static const int qk = QK2_2; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + + for (int j = 0; j < qk/4; ++j) { + const int8_t q = x[i].qs[j]; + + y[i*qk + j + 0 ] = (float) (((q >> 0) & 3) - 2); + y[i*qk + j + 1*qk/4] = (float) (((q >> 2) & 3) - 2); + y[i*qk + j + 2*qk/4] = (float) (((q >> 4) & 3) - 2); + y[i*qk + j + 3*qk/4] = (float) (((q >> 6) & 3) - 2); + } + } +} + void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) { static const int qk = QK4_0; @@ -3876,82 +3925,18 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r #if defined(__AVX2__) __m256 acc = _mm256_setzero_ps(); - int leftovers = nb % 2; - - for (int i = 0; i < nb - leftovers; i += 2) { - - const __m256 d0 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 0].d) ); - const __m256 d1 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 1].d) ); - - // assuming two consecutive blocks are contiguous AND aligned - __m128i xq16b = _mm_load_si128((const __m128i *) (x[i].qs)); - __m256i xq16 = MM256_SET_M128I(xq16b, xq16b); - __m256i xq8l0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, - 4, -1, 4, -1, 4, -1, 4, -1, - 1, -1, 1, -1, 1, -1, 1, -1, - 0, -1, 0, -1, 0, -1, 0, -1)); - __m256i xq8h0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, - 6, -1, 6, -1, 6, -1, 6, -1, - 3, -1, 3, -1, 3, -1, 3, -1, - 2, -1, 2, -1, 2, -1, 2, -1)); - __m256i xq8l1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(13, -1, 13, -1, 13, -1, 13, -1, - 12, -1, 12, -1, 12, -1, 12, -1, - 9, -1, 9, -1, 9, -1, 9, -1, - 8, -1, 8, -1, 8, -1, 8, -1)); - __m256i xq8h1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(15, -1, 15, -1, 15, -1, 15, -1, - 14, -1, 14, -1, 14, -1, 14, -1, - 11, -1, 11, -1, 11, -1, 11, -1, - 10, -1, 10, -1, 10, -1, 10, -1)); - __m256i shift = _mm256_set_epi16(64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1); - xq8l0 = _mm256_mullo_epi16(xq8l0, shift); - xq8h0 = _mm256_mullo_epi16(xq8h0, shift); - xq8l1 = _mm256_mullo_epi16(xq8l1, shift); - xq8h1 = _mm256_mullo_epi16(xq8h1, shift); - xq8l0 = _mm256_srai_epi16(xq8l0, 14); - xq8h0 = _mm256_srai_epi16(xq8h0, 14); - xq8l1 = _mm256_srai_epi16(xq8l1, 14); - xq8h1 = _mm256_srai_epi16(xq8h1, 14); - __m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0); - __m256i xq8_1 = 
_mm256_packs_epi16(xq8l1, xq8h1); - - __m256i yq8_0 = _mm256_loadu_si256((const __m256i *) (y[i + 0].qs)); - __m256i yq8_1 = _mm256_loadu_si256((const __m256i *) (y[i + 1].qs)); - - const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0); - const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1); - - acc = _mm256_fmadd_ps( d0, q0, acc ); - acc = _mm256_fmadd_ps( d1, q1, acc ); - } - - for (int i = nb - leftovers; i < nb; ++i) { + for (int i = 0; i < nb; ++i) { const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i].d) ); - __m128i xq8b = _mm_loadu_si64(x[i].qs); - __m256i xq8 = MM256_SET_M128I(xq8b, xq8b); - __m256i xq8l = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, - 4, -1, 4, -1, 4, -1, 4, -1, - 1, -1, 1, -1, 1, -1, 1, -1, - 0, -1, 0, -1, 0, -1, 0, -1)); - __m256i xq8h = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, - 6, -1, 6, -1, 6, -1, 6, -1, - 3, -1, 3, -1, 3, -1, 3, -1, - 2, -1, 2, -1, 2, -1, 2, -1)); - __m256i shift = _mm256_set_epi16(64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1); - xq8l = _mm256_mullo_epi16(xq8l, shift); - xq8h = _mm256_mullo_epi16(xq8h, shift); - xq8l = _mm256_srai_epi16(xq8l, 14); - xq8h = _mm256_srai_epi16(xq8h, 14); - xq8 = _mm256_packs_epi16(xq8l, xq8h); - - __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs)); + // assuming this is always aligned + __m256i xq8 = _mm256_set1_epi64x(*(const int64_t *) x[i].qs); + xq8 = _mm256_srlv_epi64(xq8, _mm256_set_epi64x(6, 4, 2, 0)); + xq8 = _mm256_and_si256(xq8, _mm256_set1_epi8(0x03)); + // stangely enough, this is much slower with 1 instead of 2 + xq8 = _mm256_sub_epi8(xq8, _mm256_set1_epi8(2)); + + const __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs)); const __m256 q = mul_sum_i8_pairs_float(xq8, yq8); acc = _mm256_fmadd_ps( d, q, acc ); @@ -3964,11 +3949,11 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; i++) { int sumi = 0; for (int j = 0; j < qk / 4; j++) { - const int8_t* weight = (const int8_t *)(q22_grid + x[i].qs[j]); - sumi += (int)y[i].qs[4*j+0] * weight[0]; - sumi += (int)y[i].qs[4*j+1] * weight[1]; - sumi += (int)y[i].qs[4*j+2] * weight[2]; - sumi += (int)y[i].qs[4*j+3] * weight[3]; + const uint8_t weight = x[i].qs[j]; + sumi += (int)y[i].qs[j + 0*qk/4] * ((weight >> 0) & 3) - 2; + sumi += (int)y[i].qs[j + 1*qk/4] * ((weight >> 2) & 3) - 2; + sumi += (int)y[i].qs[j + 2*qk/4] * ((weight >> 4) & 3) - 2; + sumi += (int)y[i].qs[j + 3*qk/4] * ((weight >> 6) & 3) - 2; } sumf += (float)(sumi)*(GGML_FP16_TO_FP32(y[i].d)); } From 638ad52f87345b02968664cad56ecb800b567baa Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 23 Jun 2024 19:44:09 -0400 Subject: [PATCH 05/28] ggml-quants : cleanup Q1_3 code formatting --- ggml/src/ggml-quants.c | 69 +++++++++--------------------------------- 1 file changed, 15 insertions(+), 54 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 4d7c6ff61ea52..927737fa09f15 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3424,48 +3424,6 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6 const int64_t nb = k / QK1_3; static_assert(sizeof(x->q) % 4 == 0, "bad block_q1_3.q size"); -// #if defined(__SSE2__) -// __m128 vscale = _mm_set1_ps(scale); - -// for (int64_t i = 0; i < nb; ++i) { -// for (size_t j = 0; j < sizeof(x->q); j += 4) { -// __m128 q1 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 0]])); -// __m128 q2 = 
_mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 1]])); -// __m128 q3 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 2]])); -// __m128 q4 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 3]])); -// q1 = _mm_mul_ps(q1, vscale); -// q2 = _mm_mul_ps(q2, vscale); -// q3 = _mm_mul_ps(q3, vscale); -// q4 = _mm_mul_ps(q4, vscale); - -// _mm_store_ps(y + 0, q1); -// _mm_store_ps(y + 4, q2); -// _mm_store_ps(y + 8, q3); -// _mm_store_ps(y + 12, q4); -// y += 16; -// } - -// for (size_t j = 0; j < sizeof(x->q); j += 4) { -// __m128i q5i = _mm_loadu_si32(x[i].q + j); -// q5i = _mm_cvtepi8_epi16(q5i); -// q5i = _mm_add_epi16(q5i, _mm_add_epi16(q5i, q5i)); -// q5i = _mm_srli_epi16(q5i, 8); -// q5i = _mm_sub_epi16(q5i, _mm_set1_epi16(1)); -// __m128 q5 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(q5i)); -// q5 = _mm_mul_ps(q5, vscale); - -// _mm_store_ps(y, q5); -// y += 4; -// } - -// for (size_t j = 0; j < sizeof(x->qs); ++j) { -// __m128 q = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].qs[j]])); -// q = _mm_mul_ps(q, vscale); -// _mm_store_ps(y, q); -// y += 4; -// } -// } -// #else for (int64_t i = 0; i < nb; ++i) { for (size_t j = 0; j < sizeof(x->q); ++j) { const int8_t * q = (const int8_t *) (q1_3_grid + x[i].q[j]); @@ -3486,7 +3444,6 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6 } } } -// #endif } // ====================== "True" 2-bit (de)-quantization @@ -11356,14 +11313,15 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - // const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1)); - // const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12); + // const __m128i x12a = _mm_maskload_epi32((const int32_t *) x, _mm_set_epi32(0, -1, -1, -1)); + // const __m128i x12b = _mm_insert_epi8(x12a, x->qs[0], 12); // WARNING: reading 3 bytes further than necessary. 
// It's measurably faster than a masked load on an Intel Core m3-8100Y - const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q)); + const __m128i x12b = _mm_loadu_si128((const __m128i_u *) x); const __m256i x12 = MM256_SET_M128I(x12b, x12b); { + // pre-shift the values by 8 bits, and prepare the layout for later packing __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, 4, -1, 4, -1, 4, -1, 4, -1, 1, -1, 1, -1, 1, -1, 1, -1, @@ -11384,8 +11342,8 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r 3, 9, 27, 81, 3, 9, 27, 81, 3, 9, 27, 81); - const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1, - 1, 1, 1, 1, + const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1, + 1, 1, 1, 1, 3, 9, 27, 81, 3, 9, 27, 81); const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81, @@ -11409,18 +11367,21 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1)); x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1)); - const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 0].qs)); - const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 1].qs)); + const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[0].qs)); + const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[1].qs)); - const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d)); - const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d)); + const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)); + const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)); - const __m256 q0 = mul_sum_i8_pairs_float(x0, y0); - const __m256 q1 = mul_sum_i8_pairs_float(x1, y1); + const __m256 q0 = mul_sum_i8_pairs_float(y0, x0); + const __m256 q1 = mul_sum_i8_pairs_float(y1, x1); accumf = _mm256_fmadd_ps(d0, q0, accumf); accumf = _mm256_fmadd_ps(d1, q1, accumf); } + + x += 1; + y += 2; } *s = hsum_float_8(accumf); From 9465ec6e12be0498f409af7d8d5e978403058d9a Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 25 Jun 2024 01:32:14 -0400 Subject: [PATCH 06/28] ggml-quants : ARM NEON vec_dot for q2_2 and q1_3 --- ggml/src/ggml-quants.c | 188 +++++++++++++++++++++++++++++++++++++---- ggml/src/ggml.c | 12 +++ 2 files changed, 182 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 927737fa09f15..14a1ee4e97e8e 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -686,6 +686,10 @@ void quantize_row_q2_2_reference(const float * restrict x, block_q2_2 * restrict } } +void quantize_row_q2_2(const float * restrict x, void * restrict y, int64_t k) { + quantize_row_q2_2_reference(x, y, k); +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) { static const int qk = QK4_0; @@ -3900,17 +3904,81 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r } *s = hsum_float_8(acc); +#elif defined(__ARM_NEON) + float sumf0 = 0.0f; + float sumf1 = 0.0f; + + const uint8x8_t mask = vdup_n_u8(3); + const int8x8_t offset = vdup_n_s8(2); + + const int leftovers = nb % 2; + + for (int i = 0; i < nb - leftovers; i += 2) { + const uint8x8_t xq8_0 = vld1_u8(x[0].qs); + const uint8x8_t xq8_1 = vld1_u8(x[1].qs); + + const int8x8_t xq8_0_0 = vsub_s8(vreinterpret_s8_u8(vand_u8(xq8_0, mask)), offset); + const int8x8_t xq8_0_1 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_0, 2), mask)), offset); + const 
int8x8_t xq8_0_2 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_0, 4), mask)), offset); + const int8x8_t xq8_0_3 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_0, 6), mask)), offset); + const int8x8_t xq8_1_0 = vsub_s8(vreinterpret_s8_u8(vand_u8(xq8_1, mask)), offset); + const int8x8_t xq8_1_1 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_1, 2), mask)), offset); + const int8x8_t xq8_1_2 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_1, 4), mask)), offset); + const int8x8_t xq8_1_3 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_1, 6), mask)), offset); + + const int8x16_t xq8_0_l = vcombine_s8(xq8_0_0, xq8_0_1); + const int8x16_t xq8_0_h = vcombine_s8(xq8_0_2, xq8_0_3); + const int8x16_t xq8_1_l = vcombine_s8(xq8_1_0, xq8_1_1); + const int8x16_t xq8_1_h = vcombine_s8(xq8_1_2, xq8_1_3); + + const int8x16_t yq8_0_l = vld1q_s8(y[0].qs + 0); + const int8x16_t yq8_0_h = vld1q_s8(y[0].qs + 16); + const int8x16_t yq8_1_l = vld1q_s8(y[1].qs + 0); + const int8x16_t yq8_1_h = vld1q_s8(y[1].qs + 16); + + const int16x8_t dot0 = vaddq_s16(vpaddlq_s8(vmulq_s8(xq8_0_l, yq8_0_l)), vpaddlq_s8(vmulq_s8(xq8_0_h, yq8_0_h))); + const int16x8_t dot1 = vaddq_s16(vpaddlq_s8(vmulq_s8(xq8_1_l, yq8_1_l)), vpaddlq_s8(vmulq_s8(xq8_1_h, yq8_1_h))); + + sumf0 += GGML_FP16_TO_FP32(y[0].d) * (float) vaddlvq_s16(dot0); + sumf1 += GGML_FP16_TO_FP32(y[1].d) * (float) vaddlvq_s16(dot1); + x += 2; + y += 2; + } + + // one block at a time + for (int i = nb - leftovers; i < nb; ++i) { + const uint8x8_t xq8 = vld1_u8(x->qs); + const int8x8_t xq8_0 = vsub_s8(vreinterpret_s8_u8(vand_u8(xq8, mask)), offset); + const int8x8_t xq8_1 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8, 2), mask)), offset); + const int8x8_t xq8_2 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8, 4), mask)), offset); + const int8x8_t xq8_3 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8, 6), mask)), offset); + + const int8x16_t xq8_l = vcombine_s8(xq8_0, xq8_1); + const int8x16_t xq8_h = vcombine_s8(xq8_2, xq8_3); + + const int8x16_t yq8_l = vld1q_s8(y->qs + 0); + const int8x16_t yq8_h = vld1q_s8(y->qs + 16); + + const int16x8_t dot0 = vpaddlq_s8(vmulq_s8(xq8_l, yq8_l)); + const int16x8_t dot1 = vpaddlq_s8(vmulq_s8(xq8_h, yq8_h)); + + sumf0 += GGML_FP16_TO_FP32(y->d) * (float) vaddlvq_s16(vaddq_s16(dot0, dot1)); + x += 1; + y += 1; + } + + *s = sumf0 + sumf1; #else - float sumf = 0.0; + float sumf = 0.0f; for (int i = 0; i < nb; i++) { int sumi = 0; for (int j = 0; j < qk / 4; j++) { const uint8_t weight = x[i].qs[j]; - sumi += (int)y[i].qs[j + 0*qk/4] * ((weight >> 0) & 3) - 2; - sumi += (int)y[i].qs[j + 1*qk/4] * ((weight >> 2) & 3) - 2; - sumi += (int)y[i].qs[j + 2*qk/4] * ((weight >> 4) & 3) - 2; - sumi += (int)y[i].qs[j + 3*qk/4] * ((weight >> 6) & 3) - 2; + sumi += (int)y[i].qs[j + 0*qk/4] * (((weight >> 0) & 3) - 2); + sumi += (int)y[i].qs[j + 1*qk/4] * (((weight >> 2) & 3) - 2); + sumi += (int)y[i].qs[j + 2*qk/4] * (((weight >> 4) & 3) - 2); + sumi += (int)y[i].qs[j + 3*qk/4] * (((weight >> 6) & 3) - 2); } sumf += (float)(sumi)*(GGML_FP16_TO_FP32(y[i].d)); } @@ -11314,27 +11382,27 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { // const __m128i x12a = _mm_maskload_epi32((const int32_t *) x, _mm_set_epi32(0, -1, -1, -1)); - // const __m128i x12b = _mm_insert_epi8(x12a, x->qs[0], 12); + // const __m128i x13b = _mm_insert_epi8(x12a, x->qs[0], 12); // WARNING: reading 3 bytes further than necessary. 
// It's measurably faster than a masked load on an Intel Core m3-8100Y - const __m128i x12b = _mm_loadu_si128((const __m128i_u *) x); - const __m256i x12 = MM256_SET_M128I(x12b, x12b); + const __m128i x13b = _mm_loadu_si128((const __m128i_u *) x); + const __m256i x13 = MM256_SET_M128I(x13b, x13b); { // pre-shift the values by 8 bits, and prepare the layout for later packing - __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, + __m256i x0l = _mm256_shuffle_epi8(x13, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, 4, -1, 4, -1, 4, -1, 4, -1, 1, -1, 1, -1, 1, -1, 1, -1, 0, -1, 0, -1, 0, -1, 0, -1)); - __m256i x0h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, + __m256i x0h = _mm256_shuffle_epi8(x13, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, 6, -1, 6, -1, 6, -1, 6, -1, 3, -1, 3, -1, 3, -1, 3, -1, 2, -1, 2, -1, 2, -1, 2, -1)); - __m256i x1l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 6, -1, 5, -1, 4, -1, + __m256i x1l = _mm256_shuffle_epi8(x13, _mm256_set_epi8(7, -1, 6, -1, 5, -1, 4, -1, 3, -1, 2, -1, 1, -1, 0, -1, 9, -1, 9, -1, 9, -1, 9, -1, 8, -1, 8, -1, 8, -1, 8, -1)); - __m256i x1h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(12, -1, 12, -1, 12, -1, 12, -1, + __m256i x1h = _mm256_shuffle_epi8(x13, _mm256_set_epi8(12, -1, 12, -1, 12, -1, 12, -1, 11, -1, 10, -1, 9, -1, 8, -1, 11, -1, 11, -1, 11, -1, 11, -1, 10, -1, 10, -1, 10, -1, 10, -1)); @@ -11385,6 +11453,88 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r } *s = hsum_float_8(accumf); +#elif defined(__ARM_NEON) + + static const uint8_t k_mask0[16] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + static const uint8_t k_mask1[16] = {4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7}; + static const uint8_t k_mask2[16] = {8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11}; + static const uint8_t k_mask3[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12}; + + static const uint8_t k_shift0[16] = {81, 27, 9, 3, 81, 27, 9, 3, 81, 27, 9, 3, 81, 27, 9, 3}; + static const uint8_t k_shift3[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 81, 27, 9, 3}; + + // float32x4_t sumv0 = vdupq_n_f32(0.0f); + // float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float sumf0 = 0.0f; + float sumf1 = 0.0f; + + const uint8x16_t mask0 = vld1q_u8(k_mask0); + const uint8x16_t mask1 = vld1q_u8(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + const uint8x16_t mask3 = vld1q_u8(k_mask3); + + const uint8x16_t shift0 = vld1q_u8(k_shift0); + const uint8x16_t shift3 = vld1q_u8(k_shift3); + + const int8x16_t one = vdupq_n_s8(1); + + for (int i = 0; i < nb; ++i) { + // WARNING: reading 3 bytes further than necessary + const uint8x16_t x13b = vld1q_u8((const uint8_t *) x); + + uint8x16_t x0 = vqtbl1q_u8(x13b, mask0); + uint8x16_t x1 = vqtbl1q_u8(x13b, mask1); + uint8x16_t x2 = vqtbl1q_u8(x13b, mask2); + uint8x16_t x3 = vqtbl1q_u8(x13b, mask3); + + x0 = vmulq_u8(x0, shift0); + x1 = vmulq_u8(x1, shift0); + x2 = vmulq_u8(x2, shift0); + x3 = vmulq_u8(x3, shift3); + + // multiply by 3 and keep the 2 bits above 8 bits + x0 = vshrq_n_u8(vhaddq_u8(x0, vshrq_n_u8(x0, 1)), 6); + x1 = vshrq_n_u8(vhaddq_u8(x1, vshrq_n_u8(x1, 1)), 6); + x2 = vshrq_n_u8(vhaddq_u8(x2, vshrq_n_u8(x2, 1)), 6); + x3 = vshrq_n_u8(vhaddq_u8(x3, vshrq_n_u8(x3, 1)), 6); + + // 0, 1, 2 => -1, 0, 1 + int8x16_t x0i = vsubq_s8(vreinterpretq_s8_u8(x0), one); + int8x16_t x1i = vsubq_s8(vreinterpretq_s8_u8(x1), one); + int8x16_t x2i = vsubq_s8(vreinterpretq_s8_u8(x2), one); + int8x16_t x3i = 
vsubq_s8(vreinterpretq_s8_u8(x3), one); + + const int8x16_t y0 = vld1q_s8(y[0].qs + 0); + const int8x16_t y1 = vld1q_s8(y[0].qs + 16); + const int8x16_t y2 = vld1q_s8(y[1].qs + 0); + const int8x16_t y3 = vld1q_s8(y[1].qs + 16); + + // const int32x4_t p0 = vpaddlq_s16(vaddq_s16(vpaddlq_s8(x0i), vpaddlq_s8(x1i))); + // const int32x4_t p1 = vpaddlq_s16(vaddq_s16(vpaddlq_s8(x2i), vpaddlq_s8(x3i))); + + // there's no direct equivalent to _mm_sign_epi8, unfortunately + x0i = vmulq_s8(x0i, y0); + x1i = vmulq_s8(x1i, y1); + x2i = vmulq_s8(x2i, y2); + x3i = vmulq_s8(x3i, y3); + + // overall 18.5% faster than with vector sums on a cortex-A72 + sumf0 += GGML_FP16_TO_FP32(y[0].d) * (float) vaddlvq_s16(vaddq_s16(vpaddlq_s8(x0i), vpaddlq_s8(x1i))); + sumf1 += GGML_FP16_TO_FP32(y[1].d) * (float) vaddlvq_s16(vaddq_s16(vpaddlq_s8(x2i), vpaddlq_s8(x3i))); + + // const int32x4_t p0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), x0i, y0), x1i, y1); + // const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), x2i, y2), x3i, y3); + + // sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p0), GGML_FP16_TO_FP32(y[0].d)); + // sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p1), GGML_FP16_TO_FP32(y[1].d)); + + y += 2; + x += 1; + } + + // *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); + *s = sumf0 + sumf1; #else float sumf = 0.0f; @@ -11393,34 +11543,36 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < 8; ++j) { const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].q[j]); for (int k = 0; k < 4; ++k) { - sum += xj[k] * (int16_t) y[2*i].qs[4*j + k]; + sum += xj[k] * (int16_t) y->qs[4*j + k]; } } - sumf += GGML_FP16_TO_FP32(y[2*i].d) * sum; + sumf += GGML_FP16_TO_FP32(y->d) * sum; + y += 1; sum = 0; for (int j = 0; j < 4; ++j) { const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].q[8 + j]); for (int k = 0; k < 4; ++k) { - sum += xj[k] * (int16_t) y[2*i + 1].qs[4*j + k]; + sum += xj[k] * (int16_t) y->qs[4*j + k]; } } for (size_t j = 0; j < 12; ++j) { uint16_t xj = x[i].q[j]; xj = (xj * 3) >> 8; - sum += ((int16_t) xj - 1) * (int16_t) y[2*i + 1].qs[16 + j]; + sum += ((int16_t) xj - 1) * (int16_t) y->qs[16 + j]; } { const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].qs[0]); for (int k = 0; k < 4; ++k) { - sum += (int16_t) xj[k] * (int16_t) y[2*i + 1].qs[28 + k]; + sum += (int16_t) xj[k] * (int16_t) y->qs[28 + k]; } } - sumf += GGML_FP16_TO_FP32(y[2*i + 1].d) * sum; + sumf += GGML_FP16_TO_FP32(y->d) * sum; + y += 1; } *s = sumf; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8c444d0b6f67f..a3a062ed8dabf 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -823,6 +823,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q2_2] = { + .type_name = "q2_2", + .blck_size = QK2_2, + .type_size = sizeof(block_q2_2), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_2, + .from_float = quantize_row_q2_2, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_2_reference, + .vec_dot = ggml_vec_dot_q2_2_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_Q1_3] = { .type_name = "q1_3", .blck_size = QK1_3, From 89dc3b254cb85d879559007a4a5d1a5cce62822f Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 26 Jun 2024 15:31:48 -0400 Subject: [PATCH 07/28] ggml-quants : use ceiling division when quantizing q1_3 --- convert-hf-to-gguf.py | 2 +- ggml/src/ggml-quants.c | 8 ++++---- gguf-py/gguf/quants.py | 3 +-- tests/test-quantize-fns.cpp | 6 
++++++ 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a8aef09b93369..ec66316ee7804 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -336,7 +336,7 @@ def write_tensors(self): shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + shape_str = f"{{{', '.join(str(n) for n in reversed(shape)) or '1'}}}" # n_dims is implicit in the shape logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 14a1ee4e97e8e..5dd682b602d56 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3389,8 +3389,8 @@ void quantize_row_q1_3_reference(const float * restrict x, block_q1_3 * restrict int xi = nearest_int(x[j]); uint8_t xt = xi < 0 ? 0 : xi == 0 ? 1 : 2; q[j] += xt * pow3[4]; - q[j] = ((uint16_t)q[j] * 256) / pow3[5]; - q[j] += (uint8_t)(q[j] != 0); + // ceiling division + q[j] = ((uint16_t)q[j] * 256 + (pow3[5] - 1)) / pow3[5]; y[i].q[j] = q[j]; } x += sizeof(y->q); @@ -3403,8 +3403,8 @@ void quantize_row_q1_3_reference(const float * restrict x, block_q1_3 * restrict qb += xt * pow3[m]; } x += 4; - qb = ((uint16_t)qb * 256) / pow3[5]; - qb += (uint8_t)(qb != 0); + // ceiling division + qb = ((uint16_t)qb * 256 + (pow3[5] - 1)) / pow3[5]; y[i].qs[j] = qb; } } diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 46820dce3b288..c66b83b3f8283 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -149,8 +149,7 @@ def __quantize_q1_3_rows(n: np.ndarray) -> np.ndarray: q4 = np.sum(q4 * pow3.reshape((1, 4)), axis=1, keepdims=True) q48 = q48 + (q12 * 81) q = np.concatenate([q48, q4], axis=1) - q = ((q.astype(np.uint16) * 256) // 243).astype(np.uint8) - q = np.where(q != 0, q + 1, 0) + q = (((q.astype(np.uint16) * 256) + (243 - 1)) // 243).astype(np.uint8) return q.reshape(__quantize_q1_3_shape_change(shape)) diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index e690ac6c85a71..d977aa26bc00e 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -15,11 +15,13 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_BITNET = 0.015625f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; +constexpr float MAX_DOT_PRODUCT_ERROR_BITNET = 0.5f; static const char* RESULT_STR[] = {"ok", "FAILED"}; @@ -144,6 +146,8 @@ int main(int argc, char * argv[]) { if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = + type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET : + type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? 
MAX_QUANTIZATION_TOTAL_ERROR_3BITS : @@ -166,6 +170,8 @@ int main(int argc, char * argv[]) { const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT + : type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3 + ? MAX_DOT_PRODUCT_ERROR_BITNET : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed; From 961e2938333ce6e1fa723a7be09e984093950864 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 26 Jun 2024 16:24:40 -0400 Subject: [PATCH 08/28] convert-hf : simplify BitNet pre-quantization This still results in the exact same tensor weights and scales, but it reveals some weirdness in the current algorithm. --- convert-hf-to-gguf.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ec66316ee7804..9f4094194b4a3 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -265,7 +265,10 @@ def write_tensors(self): break for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): - data: np.ndarray = data # type hint + data: np.ndarray # type hint + if len(data.shape) == 0: + # otherwise single-value tensors get squeezed + data = data.reshape((1,)) n_dims = len(data.shape) data_dtype = data.dtype data_qtype: gguf.GGMLQuantizationType | None = None @@ -336,7 +339,7 @@ def write_tensors(self): shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape)) or '1'}}}" + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" # n_dims is implicit in the shape logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") @@ -1446,12 +1449,13 @@ def set_gguf_parameters(self): def weight_quant(self, weight): dtype = weight.dtype weight = weight.float() - s = 1 / weight.abs().mean().clamp(min=1e-5) - weight = (weight * s).round().clamp(-1, 1) / s - scale = weight.abs().max().unsqueeze(0) - weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype) - weight = torch.sign(weight).type(dtype) - return weight.type(dtype), scale.type(torch.float32) + scale = weight.abs().mean().clamp(min=1e-5) + iscale = 1 / scale + weight = (weight * iscale).round().clamp(-1, 1) + # TODO: use the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 + return weight.type(dtype), (1 / iscale).type(torch.float32) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) From 0996149911458ce9821aa49e10db4e7c1187486d Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 26 Jun 2024 22:10:12 -0400 Subject: [PATCH 09/28] convert-hf : allow converting the weird BitNet 1.3B Its FFN size is 5460 which is not convenient. The offending tensors are kept in F16, which makes the final model 5.01 bpw. 
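As a rough illustration of the constraint (sketch only, with a hypothetical
helper name; the actual check added in this patch is gguf.can_quantize_to_q1_3()
in gguf-py): a row only fits whole Q1_3 blocks when its length is a multiple of
QK1_3 == 64, and 5460 is a multiple of neither 64 nor the Q8_0 block size of 32
(5460 % 64 == 20, 5460 % 32 == 20), hence the F16 fallback.

    // sketch only, not part of this change (hypothetical helper name)
    #include <stdbool.h>
    #include <stdint.h>

    static bool row_fits_q1_3(int64_t n_per_row) {
        return n_per_row % 64 == 0; // QK1_3 == 64 elements per block_q1_3
    }
    // row_fits_q1_3(5460) == false, so the affected FFN tensors stay in F16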
--- convert-hf-to-gguf.py | 16 ++++++++++------ gguf-py/gguf/quants.py | 4 ++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 9f4094194b4a3..2bf0967ce4f91 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -301,12 +301,16 @@ def write_tensors(self): if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: # TODO: cleaner model-specific per-tensor types # NOTE: Q1_3 is only relevant for BitNet 1.58b - if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any( - self.match_model_tensor_name(new_name, key, None) - for key in [ - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - ] + if ( + self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 + and gguf.can_quantize_to_q1_3(data) + and not any( + self.match_model_tensor_name(new_name, key, None) + for key in [ + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + ] + ) ): data = gguf.quantize_q1_3(data) assert data.dtype == np.uint8 diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index c66b83b3f8283..c96e6a34361e4 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -126,6 +126,10 @@ def quantize_q8_0(data: np.ndarray): __q1_3_block_size, __q1_3_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3] +def can_quantize_to_q1_3(n: np.ndarray) -> bool: + return n.shape[-1] % __q1_3_block_size == 0 + + def __quantize_q1_3_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: return (*s[:-1], s[-1] // __q1_3_block_size * __q1_3_type_size) From bfd2f21fb43525a8757a8c9e44032fd14bac222b Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 28 Jun 2024 20:38:12 -0400 Subject: [PATCH 10/28] bitnet : replace 1.58b with b1.58, as in the paper --- convert-hf-to-gguf.py | 2 +- examples/quantize/quantize.cpp | 4 ++-- ggml/src/ggml-common.h | 2 +- ggml/src/ggml-quants.c | 2 +- src/llama.cpp | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 2bf0967ce4f91..eb5aaebac63af 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -300,7 +300,7 @@ def write_tensors(self): if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: # TODO: cleaner model-specific per-tensor types - # NOTE: Q1_3 is only relevant for BitNet 1.58b + # NOTE: Q1_3 is only relevant for BitNet b1.58 if ( self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and gguf.can_quantize_to_q1_3(data) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 43241df6087c7..aed39a4d00777 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -26,8 +26,8 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, - { "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet 1.58b", }, - { "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet 1.58b", }, + { "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet b1.58", }, + { "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet b1.58", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 9c680e3b1c05f..71901565158fa 100644 --- 
a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -137,7 +137,7 @@ typedef sycl::half2 ggml_half2; #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP -// 1.625 bpw for BitNet 1.58b models +// 1.625 bpw for BitNet b1.58 models #define QK1_3 64 typedef struct { uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 5dd682b602d56..e1197f4733b51 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3366,7 +3366,7 @@ size_t quantize_q2_2(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -// ====================== 1.625 bpw (de)-quantization (BitNet 1.58b) +// ====================== 1.625 bpw (de)-quantization (BitNet b1.58) void quantize_row_q1_3_reference(const float * restrict x, block_q1_3 * restrict y, int64_t k) { assert(k % QK1_3 == 0); diff --git a/src/llama.cpp b/src/llama.cpp index fa2d97e65d472..750455e33509f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4186,8 +4186,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; - case LLAMA_FTYPE_MOSTLY_Q1_3: return "Q1_3 - 1.625 bpw for BitNet 1.58b"; - case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2 - 2.000 bpw for BitNet 1.58b"; + case LLAMA_FTYPE_MOSTLY_Q1_3: return "Q1_3 - 1.625 bpw for BitNet b1.58"; + case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2 - 2.000 bpw for BitNet b1.58"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: From ec50944bf6a7460b965ce2cf669c59822ea73296 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 28 Jun 2024 20:41:13 -0400 Subject: [PATCH 11/28] ggml-quants : fix build failure on Windows --- ggml/src/ggml-quants.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e1197f4733b51..1f7460ac443e9 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -11385,7 +11385,7 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r // const __m128i x13b = _mm_insert_epi8(x12a, x->qs[0], 12); // WARNING: reading 3 bytes further than necessary. 
// It's measurably faster than a masked load on an Intel Core m3-8100Y - const __m128i x13b = _mm_loadu_si128((const __m128i_u *) x); + const __m128i x13b = _mm_loadu_si128((const __m128i *) x); const __m256i x13 = MM256_SET_M128I(x13b, x13b); { @@ -11435,8 +11435,8 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1)); x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1)); - const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[0].qs)); - const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[1].qs)); + const __m256i y0 = _mm256_loadu_si256((const __m256i *) (y[0].qs)); + const __m256i y1 = _mm256_loadu_si256((const __m256i *) (y[1].qs)); const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)); const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)); From 8fbd59308b54729a191dcf3aee3388abfa7dd6e3 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 28 Jun 2024 22:52:57 -0400 Subject: [PATCH 12/28] ggml-quants : attempt to fix Arm 32-bit support --- ggml/src/ggml-impl.h | 11 ++++------- ggml/src/ggml-quants.c | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 1d23361906c34..374b5ae04f037 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -177,7 +177,7 @@ typedef __fp16 ggml_fp16_internal_t; // 32-bit ARM compatibility -// vaddvq_s16 +// vaddlvq_s16 // vpaddq_s16 // vpaddq_s32 // vaddvq_s32 @@ -187,12 +187,9 @@ typedef __fp16 ggml_fp16_internal_t; // vzip1_u8 // vzip2_u8 -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +inline static int32_t vaddlvq_s16(int16x8_t v) { + int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); + return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); } inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 1f7460ac443e9..df4320f5b1179 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -11483,10 +11483,10 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r // WARNING: reading 3 bytes further than necessary const uint8x16_t x13b = vld1q_u8((const uint8_t *) x); - uint8x16_t x0 = vqtbl1q_u8(x13b, mask0); - uint8x16_t x1 = vqtbl1q_u8(x13b, mask1); - uint8x16_t x2 = vqtbl1q_u8(x13b, mask2); - uint8x16_t x3 = vqtbl1q_u8(x13b, mask3); + uint8x16_t x0 = ggml_vqtbl1q_u8(x13b, mask0); + uint8x16_t x1 = ggml_vqtbl1q_u8(x13b, mask1); + uint8x16_t x2 = ggml_vqtbl1q_u8(x13b, mask2); + uint8x16_t x3 = ggml_vqtbl1q_u8(x13b, mask3); x0 = vmulq_u8(x0, shift0); x1 = vmulq_u8(x1, shift0); From dd3e62a703c1ec8da6426d215cb02686d88e679e Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 28 Jul 2024 21:17:16 -0400 Subject: [PATCH 13/28] ggml : add some informative comments in q1_3 vec_dot --- ggml/src/ggml-quants.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index df4320f5b1179..27831a380d22c 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -11419,10 +11419,12 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r 3, 9, 27, 81, 3, 9, 27, 81); // extract ternary values + // first by 
shifting the numbers to make each one the next significant digit x0l = _mm256_mullo_epi16(x0l, shift0); x0h = _mm256_mullo_epi16(x0h, shift0); x1l = _mm256_mullo_epi16(x1l, shift1l); x1h = _mm256_mullo_epi16(x1h, shift1h); + // then by extracting each of these most significant digits x0l = _mm256_mulhi_epu16(x0l, _mm256_set1_epi16(3)); x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3)); x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3)); From 77b8f84ae7d57fd18a70b07640e53d4c0bbc563b Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 30 Jul 2024 17:55:54 -0400 Subject: [PATCH 14/28] ggml : add TQ1_0 and TQ2_0 ternary quantization types --- examples/quantize/quantize.cpp | 2 + ggml/include/ggml.h | 6 +- ggml/src/ggml-common.h | 19 ++ ggml/src/ggml-quants.c | 463 ++++++++++++++++++++++++++++++++- ggml/src/ggml-quants.h | 15 ++ ggml/src/ggml.c | 36 ++- gguf-py/gguf/constants.py | 9 +- include/llama.h | 6 +- src/llama.cpp | 9 + tests/test-quantize-fns.cpp | 14 +- 10 files changed, 563 insertions(+), 16 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 88044e6226445..1086cc9ed086f 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -26,6 +26,8 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, + { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", }, + { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", }, { "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet b1.58", }, { "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet b1.58", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 6bc4be298b732..03884cba4df51 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -390,8 +390,10 @@ extern "C" { GGML_TYPE_Q4_0_4_4 = 31, GGML_TYPE_Q4_0_4_8 = 32, GGML_TYPE_Q4_0_8_8 = 33, - GGML_TYPE_Q2_2 = 34, - GGML_TYPE_Q1_3 = 35, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + GGML_TYPE_Q2_2 = 36, + GGML_TYPE_Q1_3 = 37, GGML_TYPE_COUNT, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index fbfc24d6cb74b..3be4dd4ca783d 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -241,6 +241,25 @@ typedef struct { } block_q8_0x8; static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); +// +// Ternary quantization +// + +// 1.6875 bpw +typedef struct { + uint8_t q[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256) + uint8_t qs[QK_K/64]; // 4 elements per byte + ggml_half d; +} block_tq1_0; +static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding"); + +// 2.0625 bpw +typedef struct { + uint8_t q[QK_K/4]; // 2 bits per element + ggml_half d; +} block_tq2_0; +static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3e6231e7d7850..44506f731a607 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3366,7 +3366,190 @@ size_t quantize_q2_2(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -// ====================== 1.625 bpw (de)-quantization 
(BitNet b1.58) +// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) + +void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK_K; j++) { + const float v = x[j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + // 5 elements per byte, along 32 bytes + for (size_t j = 0; j < sizeof(y->q) - sizeof(y->q) % 32; j += 32) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 5; ++n) { + int xi = nearest_int(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2 + q *= 3; + q += xi; + } + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[i].q[j + m] = q; + } + x += 5*32; + } + // along 16 bytes + for (size_t j = sizeof(y->q) - sizeof(y->q) % 32; j < sizeof(y->q); j += 16) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 5; ++n) { + int xi = nearest_int(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2 + q *= 3; + q += xi; + } + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[i].q[j + m] = q; + } + x += 5*16; + } + // 4 elements per byte + for (size_t j = 0; j < sizeof(y->qs); ++j) { + uint8_t q = 0; + for (size_t m = 0; m < 4; ++m) { + // -1, 0, 1 -> 0, 1, 2 + int xi = nearest_int(x[j + m*sizeof(y->qs)] * id) + 1; + q *= 3; + q += xi; + } + // shift the first value to the most significant trit + q *= 3; + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[i].qs[j] = q; + } + x += 4*sizeof(y->qs); + } +} + +void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK_K; j++) { + const float v = x[j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + // TODO: should it be along 64 bytes instead for AVX512? 
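+ // note: each output byte packs 4 ternary values, 2 bits each, one from each
+ // of the 4 consecutive 32-element runs handled per outer iteration, stored
+ // biased by 2 (-1, 0, 1 -> 1, 2, 3) so dequantization is just a shift, a
+ // mask and a subtraction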
+ for (size_t j = 0; j < sizeof(y->q); j += 32) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 4; ++n) { + // -1, 0, 1 -> 1, 2, 3 + int xi = nearest_int(x[m + n*32] * id) + 2; + q += (xi & 3) << (2*n); + } + y[i].q[j + m] = q; + } + x += 4*32; + } + } +} + +void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) { + assert(k % QK_K == 0); + block_tq1_0 * restrict y = vy; + quantize_row_tq1_0_ref(x, y, k); +} + +void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) { + assert(k % QK_K == 0); + block_tq2_0 * restrict y = vy; + quantize_row_tq2_0_ref(x, y, k); +} + +size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); + quantize_row_tq1_0(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + +size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); + quantize_row_tq2_0(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + + +void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + for (int64_t i = 0; i < nb; ++i) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) { + for (size_t n = 0; n < 5; ++n) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].q[j + m] * pow3[n]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + *y++ = (float) (xi - 1) * d; + } + } + } + for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) { + for (size_t n = 0; n < 5; ++n) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].q[j + m] * pow3[n]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + *y++ = (float) (xi - 1) * d; + } + } + } + + for (size_t n = 0; n < 4; ++n) { + for (size_t j = 0; j < sizeof(x->qs); ++j) { + uint8_t q = x[i].qs[j] * pow3[n]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + *y++ = (float) (xi - 1) * d; + } + } + } +} + +void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t i = 0; i < nb; ++i) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t m = 0; m < 32; ++m) { + *y++ = (float) (((x[i].q[j + m] >> (l*2)) & 3) - 2) * d; + } + } + } + } +} void quantize_row_q1_3_ref(const float * restrict x, block_q1_3 * restrict y, int64_t k) { assert(k % QK1_3 == 0); @@ -5730,6 +5913,276 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } +void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums + __m256i sumi0 = _mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + 
__m256i sumi2 = _mm256_setzero_si256(); + + // first 32 bytes of 5 elements + { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q)); + // 8-bit multiplies with shifts, masks and adds + __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 + __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 + __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 + __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 + + // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? + + // Cancel the +1 from avg so that it behaves like a halving add + qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); + qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); + qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); + qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); + qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); + qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); + qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); + qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); + qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); + qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); + qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); + + // 0, 1, 2 => -1, 0, 1 + qx0 = _mm256_sub_epi8(qx0, _mm256_set1_epi8(1)); + qx1 = _mm256_sub_epi8(qx1, _mm256_set1_epi8(1)); + qx2 = _mm256_sub_epi8(qx2, _mm256_set1_epi8(1)); + qx3 = _mm256_sub_epi8(qx3, _mm256_set1_epi8(1)); + qx4 = _mm256_sub_epi8(qx4, _mm256_set1_epi8(1)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); + const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); + + // dot + qx0 = _mm256_sign_epi8(qy0, qx0); + qx1 = _mm256_sign_epi8(qy1, qx1); + qx2 = _mm256_sign_epi8(qy2, qx2); + qx3 = _mm256_sign_epi8(qy3, qx3); + qx4 = _mm256_sign_epi8(qy4, qx4); + + // widening addition + qx0 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx0); + qx1 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx1); + qx2 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx2); + qx3 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx3); + qx4 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx4); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + sumi2 = _mm256_add_epi16(sumi2, qx4); + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].q + 32)); + __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_broadcastd_epi32(_mm_loadu_si32((const void *) x[i].qs))); + __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 + __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 + __m128i qx3 = 
_mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 + __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 + __m256i qx01 = MM256_SET_M128I(qx1, qx0); + __m256i qx23 = MM256_SET_M128I(qx3, qx2); + + // avx2 does not have 8-bit multiplies, so 16-bit it is. + qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); + qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); + __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); + + __m256i qx45 = MM256_SET_M128I(qx5, qx4); + + // Cancel the +1 from avg so that it behaves like a halving add + qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); + qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); + qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); + qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); + qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); + qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); + qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); + qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); + + // 0, 1, 2 => -1, 0, 1 + qx01 = _mm256_sub_epi8(qx01, _mm256_set1_epi8(1)); + qx23 = _mm256_sub_epi8(qx23, _mm256_set1_epi8(1)); + qx45 = _mm256_sub_epi8(qx45, _mm256_set1_epi8(1)); + + const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); + const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); + const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); + + // dot + qx01 = _mm256_sign_epi8(qy01, qx01); + qx23 = _mm256_sign_epi8(qy23, qx23); + qx45 = _mm256_sign_epi8(qy45, qx45); + + // widening addition + qx01 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx01); + qx23 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx23); + qx45 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx45); + + sumi0 = _mm256_add_epi16(sumi0, qx01); + sumi1 = _mm256_add_epi16(sumi1, qx23); + sumi2 = _mm256_add_epi16(sumi2, qx45); + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); +// #elif defined __ARM_NEON +#else + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].q[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].q[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qs); ++j) { + uint8_t q = x[i].qs[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->q)*5 + l*sizeof(x->qs) + j]; + } + } + + sumf += 
(float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums, because 256*127 still fits + __m256i sumi0 = _mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q + j)); + __m256i qx1 = _mm256_srli_epi16(qx0, 2); + __m256i qx2 = _mm256_srli_epi16(qx0, 4); + __m256i qx3 = _mm256_srli_epi16(qx0, 6); + + // 1, 2, 3 => -1, 0, 1 + qx0 = _mm256_sub_epi8(_mm256_and_si256(qx0, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + qx1 = _mm256_sub_epi8(_mm256_and_si256(qx1, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + qx2 = _mm256_sub_epi8(_mm256_and_si256(qx2, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + qx3 = _mm256_sub_epi8(_mm256_and_si256(qx3, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); + + qx0 = _mm256_sign_epi8(qy0, qx0); + qx1 = _mm256_sign_epi8(qy1, qx1); + qx2 = _mm256_sign_epi8(qy2, qx2); + qx3 = _mm256_sign_epi8(qy3, qx3); + + qx0 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx0); + qx1 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx1); + qx2 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx2); + qx3 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx3); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + } + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_add_epi16(sumi0, sumi1); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].q[j + k] >> (l*2)) & 3) - 2); + } + } + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +#endif +} + void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -15279,6 +15732,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte } } } break; + case GGML_TYPE_TQ1_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_tq1_0, data, nb); + } break; + case GGML_TYPE_TQ2_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb); + } break; case GGML_TYPE_IQ1_S: { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 0e30b306cb263..238cfd3fb7e42 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -28,6 +28,9 @@ 
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); +void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k); + void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); @@ -50,6 +53,9 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -73,6 +79,9 @@ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRI void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -98,6 +107,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -119,6 +131,9 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b652598f13515..5bbe0e4a89b4c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -863,7 +863,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_2, .from_float = quantize_row_q2_2, - .from_float_reference = (ggml_from_float_t) quantize_row_q2_2_reference, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_2_ref, .vec_dot = ggml_vec_dot_q2_2_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, @@ -875,7 +875,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q1_3, .from_float = quantize_row_q1_3, - .from_float_reference = (ggml_from_float_t) quantize_row_q1_3_reference, + .from_float_ref = (ggml_from_float_t) quantize_row_q1_3_ref, .vec_dot = ggml_vec_dot_q1_3_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, @@ -994,7 +994,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .ncols = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, - } + }, + [GGML_TYPE_TQ1_0] = { + .type_name = "tq1_0", + .blck_size = QK_K, + .type_size = sizeof(block_tq1_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq1_0, + .from_float = quantize_row_tq1_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref, + .vec_dot = ggml_vec_dot_tq1_0_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_TQ2_0] = { + .type_name = "tq2_0", + .blck_size = QK_K, + .type_size = sizeof(block_tq2_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq2_0, + .from_float = quantize_row_tq2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, + .vec_dot = ggml_vec_dot_tq2_0_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, }; // For internal test use @@ -13332,6 +13356,8 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -13923,6 +13949,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ 
-20622,6 +20650,8 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f87afd77d5e42..32e73e56aa424 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1145,8 +1145,13 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 - Q2_2 = 31 - Q1_3 = 32 + Q4_0_4_4 = 31 + Q4_0_4_8 = 32 + Q4_0_8_8 = 33 + TQ1_0 = 34 + TQ2_0 = 35 + Q1_3 = 36 + Q2_2 = 37 # TODO: add GGMLFileType from ggml_ftype in ggml.h diff --git a/include/llama.h b/include/llama.h index 2a3f2c913a805..7dcc260e8faba 100644 --- a/include/llama.h +++ b/include/llama.h @@ -166,8 +166,10 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_2 = 36, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q1_3 = 37, // except 1d tensors + LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors + LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q1_3 = 38, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_2 = 39, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7a33ac8527312..7ab2b47cd6ee5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3771,6 +3771,8 @@ struct llama_model_loader { case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; + case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; @@ -4466,6 +4468,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; + case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; + case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; @@ -15344,6 +15348,9 @@ static ggml_type 
llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { + new_type = GGML_TYPE_Q4_K; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -15647,6 +15654,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; + case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break; + case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index b3d07d6d186ca..a6e508d017d16 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -15,13 +15,13 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR_BITNET = 0.015625f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.015625f; // TODO: change to 0.01f constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; -constexpr float MAX_DOT_PRODUCT_ERROR_BITNET = 0.5f; +constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.5f; // TODO: change to 0.15f static const char* RESULT_STR[] = {"ok", "FAILED"}; @@ -146,8 +146,10 @@ int main(int argc, char * argv[]) { if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = - type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET : - type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET : + type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : @@ -170,8 +172,8 @@ int main(int argc, char * argv[]) { const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT - : type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3 - ? MAX_DOT_PRODUCT_ERROR_BITNET + : type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3 || type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 + ? 
MAX_DOT_PRODUCT_ERROR_TERNARY : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed; From 560873f337f4e35a1fafc5986e750687ebb32cdf Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 30 Jul 2024 23:36:52 -0400 Subject: [PATCH 15/28] ggml : even faster TQ2_0 --- ggml/src/ggml-quants.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 44506f731a607..eb71aa9aa9e25 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3456,8 +3456,8 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, for (size_t m = 0; m < 32; ++m) { uint8_t q = 0; for (size_t n = 0; n < 4; ++n) { - // -1, 0, 1 -> 1, 2, 3 - int xi = nearest_int(x[m + n*32] * id) + 2; + // -1, 0, 1 -> 0, 1, 2 + int xi = nearest_int(x[m + n*32] * id) + 1; q += (xi & 3) << (2*n); } y[i].q[j + m] = q; @@ -3544,7 +3544,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in for (size_t j = 0; j < sizeof(x->q); j += 32) { for (size_t l = 0; l < 4; ++l) { for (size_t m = 0; m < 32; ++m) { - *y++ = (float) (((x[i].q[j + m] >> (l*2)) & 3) - 2) * d; + *y++ = (float) (((x[i].q[j + m] >> (l*2)) & 3) - 1) * d; } } } @@ -6127,33 +6127,31 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * __m256i qx2 = _mm256_srli_epi16(qx0, 4); __m256i qx3 = _mm256_srli_epi16(qx0, 6); - // 1, 2, 3 => -1, 0, 1 - qx0 = _mm256_sub_epi8(_mm256_and_si256(qx0, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); - qx1 = _mm256_sub_epi8(_mm256_and_si256(qx1, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); - qx2 = _mm256_sub_epi8(_mm256_and_si256(qx2, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); - qx3 = _mm256_sub_epi8(_mm256_and_si256(qx3, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + // 0, 1, 2 (should not be 3) + qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3)); const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); - qx0 = _mm256_sign_epi8(qy0, qx0); - qx1 = _mm256_sign_epi8(qy1, qx1); - qx2 = _mm256_sign_epi8(qy2, qx2); - qx3 = _mm256_sign_epi8(qy3, qx3); - - qx0 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx0); - qx1 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx1); - qx2 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx2); - qx3 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx3); + qx0 = _mm256_maddubs_epi16(qx0, qy0); + qx1 = _mm256_maddubs_epi16(qx1, qy1); + qx2 = _mm256_maddubs_epi16(qx2, qy2); + qx3 = _mm256_maddubs_epi16(qx3, qy3); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); } + + const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_add_epi16(sumi0, sumi1); + sumi0 = _mm256_sub_epi16(sumi0, ysum); sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); @@ -6169,7 +6167,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * for (size_t j 
= 0; j < sizeof(x->q); j += 32) { for (size_t l = 0; l < 4; ++l) { for (size_t k = 0; k < 32; ++k) { - sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].q[j + k] >> (l*2)) & 3) - 2); + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].q[j + k] >> (l*2)) & 3) - 1); } } } From e9719576c48aeeb0407198e3d2d5d9ed36bf379e Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 31 Jul 2024 00:06:21 -0400 Subject: [PATCH 16/28] ggml : also faster TQ1_0 Same optimization as for TQ2_0 by offsetting the sum instead of the weights. This makes TQ1_0 almost as fast as Q8_0 on AVX2. --- ggml/src/ggml-quants.c | 44 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index eb71aa9aa9e25..9f6d91ed5055c 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5963,32 +5963,17 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); - // 0, 1, 2 => -1, 0, 1 - qx0 = _mm256_sub_epi8(qx0, _mm256_set1_epi8(1)); - qx1 = _mm256_sub_epi8(qx1, _mm256_set1_epi8(1)); - qx2 = _mm256_sub_epi8(qx2, _mm256_set1_epi8(1)); - qx3 = _mm256_sub_epi8(qx3, _mm256_set1_epi8(1)); - qx4 = _mm256_sub_epi8(qx4, _mm256_set1_epi8(1)); - const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); - // dot - qx0 = _mm256_sign_epi8(qy0, qx0); - qx1 = _mm256_sign_epi8(qy1, qx1); - qx2 = _mm256_sign_epi8(qy2, qx2); - qx3 = _mm256_sign_epi8(qy3, qx3); - qx4 = _mm256_sign_epi8(qy4, qx4); - - // widening addition - qx0 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx0); - qx1 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx1); - qx2 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx2); - qx3 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx3); - qx4 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx4); + qx0 = _mm256_maddubs_epi16(qx0, qy0); + qx1 = _mm256_maddubs_epi16(qx1, qy1); + qx2 = _mm256_maddubs_epi16(qx2, qy2); + qx3 = _mm256_maddubs_epi16(qx3, qy3); + qx4 = _mm256_maddubs_epi16(qx4, qy4); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); @@ -6025,32 +6010,23 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); - // 0, 1, 2 => -1, 0, 1 - qx01 = _mm256_sub_epi8(qx01, _mm256_set1_epi8(1)); - qx23 = _mm256_sub_epi8(qx23, _mm256_set1_epi8(1)); - qx45 = _mm256_sub_epi8(qx45, _mm256_set1_epi8(1)); - const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); - // dot - qx01 = _mm256_sign_epi8(qy01, qx01); - qx23 = _mm256_sign_epi8(qy23, qx23); - qx45 = _mm256_sign_epi8(qy45, qx45); - - // widening addition - qx01 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx01); - qx23 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx23); - qx45 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), 
qx45); + qx01 = _mm256_maddubs_epi16(qx01, qy01); + qx23 = _mm256_maddubs_epi16(qx23, qy23); + qx45 = _mm256_maddubs_epi16(qx45, qy45); sumi0 = _mm256_add_epi16(sumi0, qx01); sumi1 = _mm256_add_epi16(sumi1, qx23); sumi2 = _mm256_add_epi16(sumi2, qx45); } + const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + sumi0 = _mm256_sub_epi16(sumi0, ysum); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); From a6dd6994a585ce450f6d55901edcf76d5cc5ab75 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 31 Jul 2024 23:14:36 -0400 Subject: [PATCH 17/28] ggml : fix build issues in certain environments --- ggml/src/ggml-quants.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 9f6d91ed5055c..ba05996c8574a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3508,7 +3508,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in for (size_t n = 0; n < 5; ++n) { for (size_t m = 0; m < 32; ++m) { uint8_t q = x[i].q[j + m] * pow3[n]; - uint16_t xi = ((uint16_t) q * 3) >> 8; + int16_t xi = ((uint16_t) q * 3) >> 8; *y++ = (float) (xi - 1) * d; } } @@ -3517,7 +3517,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in for (size_t n = 0; n < 5; ++n) { for (size_t m = 0; m < 16; ++m) { uint8_t q = x[i].q[j + m] * pow3[n]; - uint16_t xi = ((uint16_t) q * 3) >> 8; + int16_t xi = ((uint16_t) q * 3) >> 8; *y++ = (float) (xi - 1) * d; } } @@ -3526,7 +3526,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in for (size_t n = 0; n < 4; ++n) { for (size_t j = 0; j < sizeof(x->qs); ++j) { uint8_t q = x[i].qs[j] * pow3[n]; - uint16_t xi = ((uint16_t) q * 3) >> 8; + int16_t xi = ((uint16_t) q * 3) >> 8; *y++ = (float) (xi - 1) * d; } } @@ -3544,7 +3544,8 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in for (size_t j = 0; j < sizeof(x->q); j += 32) { for (size_t l = 0; l < 4; ++l) { for (size_t m = 0; m < 32; ++m) { - *y++ = (float) (((x[i].q[j + m] >> (l*2)) & 3) - 1) * d; + int8_t q = (x[i].q[j + m] >> (l*2)) & 3; + *y++ = (float) (q - 1) * d; } } } @@ -3621,7 +3622,8 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6 for (size_t j = 0; j < sizeof(x->q); ++j) { uint16_t q = x[i].q[j]; - *y++ = (float) ((int16_t)((q * 3) >> 8) - 1); + int16_t qi = (q * 3) >> 8; + *y++ = (float) (qi - 1); } for (size_t j = 0; j < sizeof(x->qs); ++j) { @@ -5983,7 +5985,9 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * // last 16 bytes of 5-element, along with the 4 bytes of 4 elements { __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].q + 32)); - __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_broadcastd_epi32(_mm_loadu_si32((const void *) x[i].qs))); + uint32_t qs; + memcpy(&qs, x[i].qs, sizeof(qs)); // potentially unaligned + __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qs)); __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 From 5417089aeb941cfa4104e91eba539f5987ccd6fa Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 31 Jul 2024 23:35:04 -0400 
Subject: [PATCH 18/28] ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0 --- ggml/src/ggml-quants.c | 198 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 195 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index ba05996c8574a..b18df4b668bad 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5927,7 +5927,130 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const int nb = n / QK_K; -#if defined __AVX2__ +#if defined __ARM_NEON + float sumf = 0.0f; + + uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; + + const uint8x16_t shift = vld1q_u8(k_shift); + + for (int i = 0; i < nb; ++i) { + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); + + // first 32 bytes of 5 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].q + 0); + uint8x16_t qx1 = vld1q_u8(x[i].q + 16); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9)); + uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27)); + uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81)); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6)); + int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6)); + int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + 112); + const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); + const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); + + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_high_s8(sumi1, sqx0, qy0); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_high_s8(sumi1, sqx1, qy1); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_high_s8(sumi1, sqx2, qy2); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_high_s8(sumi1, sqx3, qy3); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_high_s8(sumi1, sqx4, qy4); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_high_s8(sumi1, sqx5, qy5); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); + sumi1 = vmlal_high_s8(sumi1, sqx6, qy6); + sumi0 = vmlal_s8(sumi0, 
vget_low_s8(sqx7), vget_low_s8(qy7)); + sumi1 = vmlal_high_s8(sumi1, sqx7, qy7); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8)); + sumi1 = vmlal_high_s8(sumi1, sqx8, qy8); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9)); + sumi1 = vmlal_high_s8(sumi1, sqx9, qy9); + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].q + 32); + uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint32_t qs; + memcpy(&qs, x[i].qs, sizeof(qs)); // potentially unaligned + uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qs)); + qx5 = vmulq_u8(qx5, shift); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 160); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 176); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 192); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 208); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 224); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); + + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_high_s8(sumi1, sqx0, qy0); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_high_s8(sumi1, sqx1, qy1); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_high_s8(sumi1, sqx2, qy2); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_high_s8(sumi1, sqx3, qy3); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_high_s8(sumi1, sqx4, qy4); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_high_s8(sumi1, sqx5, qy5); + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); + + sumi0 = vaddq_s16(sumi0, sumi1); + sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d * (float) vaddlvq_s16(sumi0); + } + + *s = sumf; + +#elif defined __AVX2__ __m256 sumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { @@ -6038,7 +6161,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * } *s = hsum_float_8(sumf); -// #elif defined __ARM_NEON + #else const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; @@ -6093,7 +6216,75 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * const int nb = n / QK_K; -#if defined __AVX2__ +#if defined __ARM_NEON + float sumf = 0.0f; + + const uint8x16_t m3 = vdupq_n_u8(3); + + for (int i = 0; i < nb; ++i) { + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + uint8x16_t qx0 = vld1q_u8(x[i].q + j); + uint8x16_t qx1 = vld1q_u8(x[i].q + j + 16); + uint8x16_t qx2 = vshrq_n_u8(qx0, 2); + uint8x16_t qx3 = vshrq_n_u8(qx1, 2); + uint8x16_t qx4 = vshrq_n_u8(qx0, 4); + uint8x16_t 
qx5 = vshrq_n_u8(qx1, 4); + uint8x16_t qx6 = vshrq_n_u8(qx0, 6); + uint8x16_t qx7 = vshrq_n_u8(qx1, 6); + + int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); + + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_high_s8(sumi1, sqx0, qy0); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_high_s8(sumi1, sqx1, qy1); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_high_s8(sumi1, sqx2, qy2); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_high_s8(sumi1, sqx3, qy3); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_high_s8(sumi1, sqx4, qy4); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_high_s8(sumi1, sqx5, qy5); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); + sumi1 = vmlal_high_s8(sumi1, sqx6, qy6); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); + sumi1 = vmlal_high_s8(sumi1, sqx7, qy7); + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); + + sumi0 = vaddq_s16(sumi0, sumi1); + sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d * (float) vaddlvq_s16(sumi0); + } + + *s = sumf; + +#elif defined __AVX2__ __m256 sumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { @@ -6138,6 +6329,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * } *s = hsum_float_8(sumf); + #else float sumf = 0.0f; From 45719a2472dd43bc3ba43d27d61fec34c6c14cb2 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 1 Aug 2024 01:11:30 -0400 Subject: [PATCH 19/28] ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat The compiler seems smart enough to use the same instruction even when using vget_high_s8 instead. 
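For context, a minimal standalone sketch of the equivalence this change relies on (the helper names are illustrative, not from the patch): vmlal_high_s8 is an AArch64-only intrinsic, while the vget_high_s8 + vmlal_s8 pair also exists on 32-bit ARM NEON, and an AArch64 compiler can still fuse that pair into a single SMLAL2.

    #include <arm_neon.h>

    // Widening multiply-accumulate over the high 8 lanes, written two ways.
    #ifdef __aarch64__
    static inline int16x8_t mla_high_a64(int16x8_t acc, int8x16_t a, int8x16_t b) {
        return vmlal_high_s8(acc, a, b);    // AArch64-only intrinsic (SMLAL2)
    }
    #endif

    static inline int16x8_t mla_high_portable(int16x8_t acc, int8x16_t a, int8x16_t b) {
        // Also valid on 32-bit ARM NEON; on AArch64 this typically still
        // compiles down to the same SMLAL2 instruction.
        return vmlal_s8(acc, vget_high_s8(a), vget_high_s8(b));
    }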
--- ggml/src/ggml-quants.c | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index b18df4b668bad..e66d9be25dac5 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5975,25 +5975,25 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_high_s8(sumi1, sqx0, qy0); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_high_s8(sumi1, sqx1, qy1); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_high_s8(sumi1, sqx2, qy2); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_high_s8(sumi1, sqx3, qy3); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_high_s8(sumi1, sqx4, qy4); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_high_s8(sumi1, sqx5, qy5); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); - sumi1 = vmlal_high_s8(sumi1, sqx6, qy6); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); - sumi1 = vmlal_high_s8(sumi1, sqx7, qy7); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8)); - sumi1 = vmlal_high_s8(sumi1, sqx8, qy8); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9)); - sumi1 = vmlal_high_s8(sumi1, sqx9, qy9); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9)); } // last 16 bytes of 5-element, along with the 4 bytes of 4 elements @@ -6024,17 +6024,17 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_high_s8(sumi1, sqx0, qy0); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_high_s8(sumi1, sqx1, qy1); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_high_s8(sumi1, sqx2, qy2); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_high_s8(sumi1, sqx3, qy3); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_high_s8(sumi1, sqx4, qy4); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_high_s8(sumi1, sqx5, qy5); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); } const int16x8_t ysum0 = vld1q_s16(y[i].bsums); @@ -6254,21 +6254,21 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * const 
int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_high_s8(sumi1, sqx0, qy0); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_high_s8(sumi1, sqx1, qy1); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_high_s8(sumi1, sqx2, qy2); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_high_s8(sumi1, sqx3, qy3); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_high_s8(sumi1, sqx4, qy4); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_high_s8(sumi1, sqx5, qy5); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); - sumi1 = vmlal_high_s8(sumi1, sqx6, qy6); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); - sumi1 = vmlal_high_s8(sumi1, sqx7, qy7); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); } const int16x8_t ysum0 = vld1q_s16(y[i].bsums); From 04eec5811274df212a0f928929a262de1168a378 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 2 Aug 2024 19:52:19 -0400 Subject: [PATCH 20/28] ggml : remove q1_3 and q2_2 * llama : remove the separate scale tensors of BitNet b1.58 They won't be needed, since the remaining ternary quant types have built-in scales. 
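As a rough illustration (not part of the patch, and the function name is made up), the reason the extra ".scale" tensors become redundant: the conversion now folds the per-tensor scale back into the ternary fp32 weights, so every weight is one of {-s, 0, +s}, and the block-wise TQ1_0/TQ2_0 quantizers can then derive their own per-block scale from those values. A conceptual C equivalent of the updated weight_quant() logic, assuming a plain float array:

    #include <math.h>

    // Keep w[i] in {-s, 0, +s} with s = mean(|w|) clamped to 1e-5, instead of
    // emitting a separate scale tensor; the row-wise ternary quantizer later
    // recovers the scale on its own.
    static void fold_ternary_scale(float * w, int n) {
        float s = 0.0f;
        for (int i = 0; i < n; ++i) s += fabsf(w[i]);
        s = fmaxf(s / n, 1e-5f);                 // per-tensor scale
        for (int i = 0; i < n; ++i) {
            float t = roundf(w[i] / s);          // nearest of -1, 0, +1,
            t = t < -1.0f ? -1.0f : t > 1.0f ? 1.0f : t;   // then clamped
            w[i] = t * s;                        // scale folded back in
        }
    }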
--- convert_hf_to_gguf.py | 50 +--- examples/quantize/quantize.cpp | 2 - ggml/include/ggml.h | 2 - ggml/src/ggml-common.h | 49 ---- ggml/src/ggml-quants.c | 481 +-------------------------------- ggml/src/ggml-quants.h | 10 - ggml/src/ggml.c | 28 -- gguf-py/gguf/constants.py | 16 +- gguf-py/gguf/quants.py | 52 ---- include/llama.h | 2 - src/llama.cpp | 38 +-- tests/test-quantize-fns.cpp | 8 +- 12 files changed, 45 insertions(+), 693 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e93c3829acad8..a137c157e2bc8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -284,9 +284,6 @@ def prepare_tensors(self): for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): data: np.ndarray # type hint - if len(data.shape) == 0: - # otherwise single-value tensors get squeezed - data = data.reshape((1,)) n_dims = len(data.shape) data_dtype = data.dtype data_qtype: gguf.GGMLQuantizationType | None = None @@ -317,33 +314,12 @@ def prepare_tensors(self): )) if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - # TODO: cleaner model-specific per-tensor types - # NOTE: Q1_3 is only relevant for BitNet b1.58 - if ( - self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 - and gguf.can_quantize_to_q1_3(data) - and not any( - self.match_model_tensor_name(new_name, key, None) - for key in [ - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - ] - ) - ): - data = gguf.quantize_q1_3(data) - assert data.dtype == np.uint8 - data_qtype = gguf.GGMLQuantizationType.Q1_3 - - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: data = gguf.quantize_bf16(data) assert data.dtype == np.int16 data_qtype = gguf.GGMLQuantizationType.BF16 - elif ( - self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 - or self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 - and gguf.can_quantize_to_q8_0(data) - ): + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): data = gguf.quantize_q8_0(data) assert data.dtype == np.uint8 data_qtype = gguf.GGMLQuantizationType.Q8_0 @@ -1635,12 +1611,6 @@ def prepare_tensors(self): class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, *args, **kwargs): - if ftype == gguf.LlamaFileType.GUESSED: - ftype = gguf.LlamaFileType.MOSTLY_Q1_3 - - super().__init__(dir_model, ftype, *args, **kwargs) - def set_vocab(self): self._set_vocab_sentencepiece() @@ -1649,16 +1619,16 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) - def weight_quant(self, weight): + def weight_quant(self, weight: Tensor) -> Tensor: dtype = weight.dtype weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale - weight = (weight * iscale).round().clamp(-1, 1) - # TODO: use the scale directly instead of inverting it twice + # TODO: multiply by the scale directly instead of inverting it twice # (this is also unnecessarily doubly inverted upstream) # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 - return weight.type(dtype), (1 / iscale).type(torch.float32) + result = (weight * iscale).round().clamp(-1, 1) / iscale + return result.type(dtype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) @@ -1673,11 
+1643,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter gguf.MODEL_TENSOR.FFN_GATE, ]): # transform weight into 1/0/-1 (in fp32) - weight_torch, scale_torch = self.weight_quant(data_torch) - yield (new_name, weight_torch) - yield (new_name.removesuffix(".weight") + ".scale", scale_torch) - else: - yield (new_name, data_torch) + data_torch = self.weight_quant(data_torch) + + yield (new_name, data_torch) @Model.register("GrokForCausalLM") diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1086cc9ed086f..7f8f724b79178 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -28,8 +28,6 @@ static const std::vector QUANT_OPTIONS = { { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", }, { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", }, - { "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet b1.58", }, - { "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet b1.58", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 03884cba4df51..3950a4a0750d5 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -392,8 +392,6 @@ extern "C" { GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, - GGML_TYPE_Q2_2 = 36, - GGML_TYPE_Q1_3 = 37, GGML_TYPE_COUNT, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 3be4dd4ca783d..c65614696fc54 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -141,20 +141,6 @@ typedef sycl::half2 ggml_half2; #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP -// 1.625 bpw for BitNet b1.58 models -#define QK1_3 64 -typedef struct { - uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256) - uint8_t qs[QK1_3/64]; // 4 elements per byte -} block_q1_3; -static_assert(sizeof(block_q1_3) == (QK1_3 - 4*QK1_3/64)/5 + QK1_3/64, "wrong q1_3 block size/padding"); - -#define QK2_2 32 -typedef struct { - uint8_t qs[QK2_2 / 4]; // nibbles / quants -} block_q2_2; -static_assert(sizeof(block_q2_2) == QK2_2 / 4, "wrong q2_2 block size/padding"); - #define QK4_0 32 typedef struct { ggml_half d; // delta @@ -1084,41 +1070,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512) 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, GGML_TABLE_END() -GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256) - 0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, - 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001, - 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, - 0xff010000, 0xff010001, 0xff0101ff, 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, - 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, - 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, - 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, - 0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, - 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, - 
0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x0101ff01, - 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, - 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, - 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, - 0xff000101, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, - 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0000, - 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, - 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, - 0x000100ff, 0x00010000, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff, - 0x01ffff00, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, - 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x01000001, 0x010001ff, - 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, - 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, - 0xffff0001, 0xffff01ff, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, - 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ff00, - 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, 0xff0101ff, 0xff010100, 0xff010101, - 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, - 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, - 0x00000100, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, - 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ff00ff, - 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x01ff0101, 0x0100ffff, 0x0100ff00, - 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, - 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, -GGML_TABLE_END() - #define NGRID_IQ1S 2048 #define IQ1S_DELTA 0.125f #define IQ1M_DELTA 0.125f diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e66d9be25dac5..a2fd0563c8cfe 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -657,39 +657,6 @@ static inline __m128i packNibbles( __m256i bytes ) { } #endif //__loongarch_asx -void quantize_row_q2_2_ref(const float * restrict x, block_q2_2 * restrict y, int64_t k) { - static const int qk = QK2_2; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - - for (int j = 0; j < qk/4; ++j) { - int8_t x0 = (int8_t)x[i*qk + 0 + j]; - int8_t x1 = (int8_t)x[i*qk + 1*qk/4 + j]; - int8_t x2 = (int8_t)x[i*qk + 2*qk/4 + j]; - int8_t x3 = (int8_t)x[i*qk + 3*qk/4 + j]; - - const uint8_t xi0 = x0 < 0 ? 1 : x0 == 0 ? 2 : 3; - const uint8_t xi1 = x1 < 0 ? 1 : x1 == 0 ? 2 : 3; - const uint8_t xi2 = x2 < 0 ? 1 : x2 == 0 ? 2 : 3; - const uint8_t xi3 = x3 < 0 ? 1 : x3 == 0 ? 
2 : 3; - - y[i].qs[j] = 0; - y[i].qs[j] |= (xi0 << 0); - y[i].qs[j] |= (xi1 << 2); - y[i].qs[j] |= (xi2 << 4); - y[i].qs[j] |= (xi3 << 6); - } - } -} - -void quantize_row_q2_2(const float * restrict x, void * restrict y, int64_t k) { - quantize_row_q2_2_ref(x, y, k); -} - // reference implementation for deterministic creation of model files void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) { static const int qk = QK4_0; @@ -1545,26 +1512,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) #endif } -void dequantize_row_q2_2(const block_q2_2 * restrict x, float * restrict y, int64_t k) { - static const int qk = QK2_2; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - - for (int j = 0; j < qk/4; ++j) { - const int8_t q = x[i].qs[j]; - - y[i*qk + j + 0 ] = (float) (((q >> 0) & 3) - 2); - y[i*qk + j + 1*qk/4] = (float) (((q >> 2) & 3) - 2); - y[i*qk + j + 2*qk/4] = (float) (((q >> 4) & 3) - 2); - y[i*qk + j + 3*qk/4] = (float) (((q >> 6) & 3) - 2); - } - } -} - void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) { static const int qk = QK4_0; @@ -3359,13 +3306,6 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -size_t quantize_q2_2(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - (void)quant_weights; // not used - const size_t row_size = ggml_row_size(GGML_TYPE_Q2_2, n_per_row); - quantize_row_q2_2_ref(src, dst, (int64_t)nrow*n_per_row); - return nrow * row_size; -} - // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) { @@ -3552,89 +3492,6 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in } } -void quantize_row_q1_3_ref(const float * restrict x, block_q1_3 * restrict y, int64_t k) { - assert(k % QK1_3 == 0); - const int64_t nb = k / QK1_3; - static_assert(sizeof(y->q) % 4 == 0, "bad block_q1_3.q size"); - - const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; - - for (int64_t i = 0; i < nb; ++i) { - uint8_t q[sizeof(y->q)] = {0}; - for (size_t j = 0; j < sizeof(y->q); ++j) { - for (size_t m = 0; m < 4; ++m) { - int xi = nearest_int(x[m]); - uint8_t xt = xi < 0 ? 0 : xi == 0 ? 1 : 2; - q[j] += xt * pow3[m]; - } - x += 4; - } - for (size_t j = 0; j < sizeof(y->q); ++j) { - int xi = nearest_int(x[j]); - uint8_t xt = xi < 0 ? 0 : xi == 0 ? 1 : 2; - q[j] += xt * pow3[4]; - // ceiling division - q[j] = ((uint16_t)q[j] * 256 + (pow3[5] - 1)) / pow3[5]; - y[i].q[j] = q[j]; - } - x += sizeof(y->q); - - for (size_t j = 0; j < sizeof(y->qs); ++j) { - uint8_t qb = 0; - for (size_t m = 0; m < 4; ++m) { - int xi = nearest_int(x[m]); - uint8_t xt = xi < 0 ? 0 : xi == 0 ? 
1 : 2; - qb += xt * pow3[m]; - } - x += 4; - // ceiling division - qb = ((uint16_t)qb * 256 + (pow3[5] - 1)) / pow3[5]; - y[i].qs[j] = qb; - } - } -} - -void quantize_row_q1_3(const float * restrict x, void * restrict vy, int64_t k) { - assert(k % QK1_3 == 0); - block_q1_3 * restrict y = vy; - quantize_row_q1_3_ref(x, y, k); -} - -size_t quantize_q1_3(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - (void)quant_weights; // not used - const size_t row_size = ggml_row_size(GGML_TYPE_Q1_3, n_per_row); - quantize_row_q1_3(src, dst, (int64_t)nrow*n_per_row); - return nrow * row_size; -} - -void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int64_t k) { - assert(k % QK1_3 == 0); - const int64_t nb = k / QK1_3; - static_assert(sizeof(x->q) % 4 == 0, "bad block_q1_3.q size"); - - for (int64_t i = 0; i < nb; ++i) { - for (size_t j = 0; j < sizeof(x->q); ++j) { - const int8_t * q = (const int8_t *) (q1_3_grid + x[i].q[j]); - for (int m = 0; m < 4; ++m) { - *y++ = (float) q[m]; - } - } - - for (size_t j = 0; j < sizeof(x->q); ++j) { - uint16_t q = x[i].q[j]; - int16_t qi = (q * 3) >> 8; - *y++ = (float) (qi - 1); - } - - for (size_t j = 0; j < sizeof(x->qs); ++j) { - const int8_t * q = (const int8_t *) (q1_3_grid + x[i].qs[j]); - for (int m = 0; m < 4; ++m) { - *y++ = (float) q[m]; - } - } - } -} - // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) { @@ -4055,122 +3912,6 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q2_2 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__AVX2__) - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i].d) ); - - // assuming this is always aligned - __m256i xq8 = _mm256_set1_epi64x(*(const int64_t *) x[i].qs); - xq8 = _mm256_srlv_epi64(xq8, _mm256_set_epi64x(6, 4, 2, 0)); - xq8 = _mm256_and_si256(xq8, _mm256_set1_epi8(0x03)); - // stangely enough, this is much slower with 1 instead of 2 - xq8 = _mm256_sub_epi8(xq8, _mm256_set1_epi8(2)); - - const __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs)); - const __m256 q = mul_sum_i8_pairs_float(xq8, yq8); - - acc = _mm256_fmadd_ps( d, q, acc ); - } - - *s = hsum_float_8(acc); -#elif defined(__ARM_NEON) - float sumf0 = 0.0f; - float sumf1 = 0.0f; - - const uint8x8_t mask = vdup_n_u8(3); - const int8x8_t offset = vdup_n_s8(2); - - const int leftovers = nb % 2; - - for (int i = 0; i < nb - leftovers; i += 2) { - const uint8x8_t xq8_0 = vld1_u8(x[0].qs); - const uint8x8_t xq8_1 = vld1_u8(x[1].qs); - - const int8x8_t xq8_0_0 = vsub_s8(vreinterpret_s8_u8(vand_u8(xq8_0, mask)), offset); - const int8x8_t xq8_0_1 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_0, 2), mask)), offset); - const int8x8_t xq8_0_2 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_0, 4), mask)), offset); - const int8x8_t xq8_0_3 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_0, 6), mask)), offset); - const int8x8_t xq8_1_0 = vsub_s8(vreinterpret_s8_u8(vand_u8(xq8_1, mask)), offset); - const int8x8_t xq8_1_1 = 
vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_1, 2), mask)), offset); - const int8x8_t xq8_1_2 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_1, 4), mask)), offset); - const int8x8_t xq8_1_3 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8_1, 6), mask)), offset); - - const int8x16_t xq8_0_l = vcombine_s8(xq8_0_0, xq8_0_1); - const int8x16_t xq8_0_h = vcombine_s8(xq8_0_2, xq8_0_3); - const int8x16_t xq8_1_l = vcombine_s8(xq8_1_0, xq8_1_1); - const int8x16_t xq8_1_h = vcombine_s8(xq8_1_2, xq8_1_3); - - const int8x16_t yq8_0_l = vld1q_s8(y[0].qs + 0); - const int8x16_t yq8_0_h = vld1q_s8(y[0].qs + 16); - const int8x16_t yq8_1_l = vld1q_s8(y[1].qs + 0); - const int8x16_t yq8_1_h = vld1q_s8(y[1].qs + 16); - - const int16x8_t dot0 = vaddq_s16(vpaddlq_s8(vmulq_s8(xq8_0_l, yq8_0_l)), vpaddlq_s8(vmulq_s8(xq8_0_h, yq8_0_h))); - const int16x8_t dot1 = vaddq_s16(vpaddlq_s8(vmulq_s8(xq8_1_l, yq8_1_l)), vpaddlq_s8(vmulq_s8(xq8_1_h, yq8_1_h))); - - sumf0 += GGML_FP16_TO_FP32(y[0].d) * (float) vaddlvq_s16(dot0); - sumf1 += GGML_FP16_TO_FP32(y[1].d) * (float) vaddlvq_s16(dot1); - x += 2; - y += 2; - } - - // one block at a time - for (int i = nb - leftovers; i < nb; ++i) { - const uint8x8_t xq8 = vld1_u8(x->qs); - const int8x8_t xq8_0 = vsub_s8(vreinterpret_s8_u8(vand_u8(xq8, mask)), offset); - const int8x8_t xq8_1 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8, 2), mask)), offset); - const int8x8_t xq8_2 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8, 4), mask)), offset); - const int8x8_t xq8_3 = vsub_s8(vreinterpret_s8_u8(vand_u8(vshr_n_u8(xq8, 6), mask)), offset); - - const int8x16_t xq8_l = vcombine_s8(xq8_0, xq8_1); - const int8x16_t xq8_h = vcombine_s8(xq8_2, xq8_3); - - const int8x16_t yq8_l = vld1q_s8(y->qs + 0); - const int8x16_t yq8_h = vld1q_s8(y->qs + 16); - - const int16x8_t dot0 = vpaddlq_s8(vmulq_s8(xq8_l, yq8_l)); - const int16x8_t dot1 = vpaddlq_s8(vmulq_s8(xq8_h, yq8_h)); - - sumf0 += GGML_FP16_TO_FP32(y->d) * (float) vaddlvq_s16(vaddq_s16(dot0, dot1)); - x += 1; - y += 1; - } - - *s = sumf0 + sumf1; -#else - - float sumf = 0.0f; - for (int i = 0; i < nb; i++) { - int sumi = 0; - for (int j = 0; j < qk / 4; j++) { - const uint8_t weight = x[i].qs[j]; - sumi += (int)y[i].qs[j + 0*qk/4] * (((weight >> 0) & 3) - 2); - sumi += (int)y[i].qs[j + 1*qk/4] * (((weight >> 2) & 3) - 2); - sumi += (int)y[i].qs[j + 2*qk/4] * (((weight >> 4) & 3) - 2); - sumi += (int)y[i].qs[j + 3*qk/4] * (((weight >> 6) & 3) - 2); - } - sumf += (float)(sumi)*(GGML_FP16_TO_FP32(y[i].d)); - } - *s = sumf; -#endif -} - void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -11855,225 +11596,6 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #endif -void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - // assumed by the code below - assert(n % QK1_3 == 0); - static_assert(QK1_3 == 2 * QK8_0, "QK1_3 must be 2 times bigger than QK8_0"); - - const block_q1_3 * restrict x = vx; - const block_q8_0 * restrict y = vy; - - const int nb = n / QK1_3; - -#if defined(__AVX2__) - __m256 accumf = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - // const __m128i x12a = _mm_maskload_epi32((const int32_t *) x, _mm_set_epi32(0, -1, -1, -1)); - // const __m128i x13b = 
_mm_insert_epi8(x12a, x->qs[0], 12); - // WARNING: reading 3 bytes further than necessary. - // It's measurably faster than a masked load on an Intel Core m3-8100Y - const __m128i x13b = _mm_loadu_si128((const __m128i *) x); - const __m256i x13 = MM256_SET_M128I(x13b, x13b); - - { - // pre-shift the values by 8 bits, and prepare the layout for later packing - __m256i x0l = _mm256_shuffle_epi8(x13, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, - 4, -1, 4, -1, 4, -1, 4, -1, - 1, -1, 1, -1, 1, -1, 1, -1, - 0, -1, 0, -1, 0, -1, 0, -1)); - __m256i x0h = _mm256_shuffle_epi8(x13, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, - 6, -1, 6, -1, 6, -1, 6, -1, - 3, -1, 3, -1, 3, -1, 3, -1, - 2, -1, 2, -1, 2, -1, 2, -1)); - __m256i x1l = _mm256_shuffle_epi8(x13, _mm256_set_epi8(7, -1, 6, -1, 5, -1, 4, -1, - 3, -1, 2, -1, 1, -1, 0, -1, - 9, -1, 9, -1, 9, -1, 9, -1, - 8, -1, 8, -1, 8, -1, 8, -1)); - __m256i x1h = _mm256_shuffle_epi8(x13, _mm256_set_epi8(12, -1, 12, -1, 12, -1, 12, -1, - 11, -1, 10, -1, 9, -1, 8, -1, - 11, -1, 11, -1, 11, -1, 11, -1, - 10, -1, 10, -1, 10, -1, 10, -1)); - const __m256i shift0 = _mm256_set_epi16(3, 9, 27, 81, - 3, 9, 27, 81, - 3, 9, 27, 81, - 3, 9, 27, 81); - const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1, - 1, 1, 1, 1, - 3, 9, 27, 81, - 3, 9, 27, 81); - const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81, - 1, 1, 1, 1, - 3, 9, 27, 81, - 3, 9, 27, 81); - // extract ternary values - // first by shifting the numbers to make each one the next significant digit - x0l = _mm256_mullo_epi16(x0l, shift0); - x0h = _mm256_mullo_epi16(x0h, shift0); - x1l = _mm256_mullo_epi16(x1l, shift1l); - x1h = _mm256_mullo_epi16(x1h, shift1h); - // then by extracting each of these most significant digits - x0l = _mm256_mulhi_epu16(x0l, _mm256_set1_epi16(3)); - x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3)); - x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3)); - x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3)); - - __m256i x0 = _mm256_packs_epi16(x0l, x0h); - __m256i x1 = _mm256_packs_epi16(x1l, x1h); - - // 0, 1, 2 => -1, 0, 1 - x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1)); - x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1)); - - const __m256i y0 = _mm256_loadu_si256((const __m256i *) (y[0].qs)); - const __m256i y1 = _mm256_loadu_si256((const __m256i *) (y[1].qs)); - - const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)); - const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)); - - const __m256 q0 = mul_sum_i8_pairs_float(y0, x0); - const __m256 q1 = mul_sum_i8_pairs_float(y1, x1); - - accumf = _mm256_fmadd_ps(d0, q0, accumf); - accumf = _mm256_fmadd_ps(d1, q1, accumf); - } - - x += 1; - y += 2; - } - - *s = hsum_float_8(accumf); -#elif defined(__ARM_NEON) - - static const uint8_t k_mask0[16] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; - static const uint8_t k_mask1[16] = {4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7}; - static const uint8_t k_mask2[16] = {8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11}; - static const uint8_t k_mask3[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12}; - - static const uint8_t k_shift0[16] = {81, 27, 9, 3, 81, 27, 9, 3, 81, 27, 9, 3, 81, 27, 9, 3}; - static const uint8_t k_shift3[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 81, 27, 9, 3}; - - // float32x4_t sumv0 = vdupq_n_f32(0.0f); - // float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float sumf0 = 0.0f; - float sumf1 = 0.0f; - - const uint8x16_t mask0 = vld1q_u8(k_mask0); - const uint8x16_t mask1 = vld1q_u8(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); - 
const uint8x16_t mask3 = vld1q_u8(k_mask3); - - const uint8x16_t shift0 = vld1q_u8(k_shift0); - const uint8x16_t shift3 = vld1q_u8(k_shift3); - - const int8x16_t one = vdupq_n_s8(1); - - for (int i = 0; i < nb; ++i) { - // WARNING: reading 3 bytes further than necessary - const uint8x16_t x13b = vld1q_u8((const uint8_t *) x); - - uint8x16_t x0 = ggml_vqtbl1q_u8(x13b, mask0); - uint8x16_t x1 = ggml_vqtbl1q_u8(x13b, mask1); - uint8x16_t x2 = ggml_vqtbl1q_u8(x13b, mask2); - uint8x16_t x3 = ggml_vqtbl1q_u8(x13b, mask3); - - x0 = vmulq_u8(x0, shift0); - x1 = vmulq_u8(x1, shift0); - x2 = vmulq_u8(x2, shift0); - x3 = vmulq_u8(x3, shift3); - - // multiply by 3 and keep the 2 bits above 8 bits - x0 = vshrq_n_u8(vhaddq_u8(x0, vshrq_n_u8(x0, 1)), 6); - x1 = vshrq_n_u8(vhaddq_u8(x1, vshrq_n_u8(x1, 1)), 6); - x2 = vshrq_n_u8(vhaddq_u8(x2, vshrq_n_u8(x2, 1)), 6); - x3 = vshrq_n_u8(vhaddq_u8(x3, vshrq_n_u8(x3, 1)), 6); - - // 0, 1, 2 => -1, 0, 1 - int8x16_t x0i = vsubq_s8(vreinterpretq_s8_u8(x0), one); - int8x16_t x1i = vsubq_s8(vreinterpretq_s8_u8(x1), one); - int8x16_t x2i = vsubq_s8(vreinterpretq_s8_u8(x2), one); - int8x16_t x3i = vsubq_s8(vreinterpretq_s8_u8(x3), one); - - const int8x16_t y0 = vld1q_s8(y[0].qs + 0); - const int8x16_t y1 = vld1q_s8(y[0].qs + 16); - const int8x16_t y2 = vld1q_s8(y[1].qs + 0); - const int8x16_t y3 = vld1q_s8(y[1].qs + 16); - - // const int32x4_t p0 = vpaddlq_s16(vaddq_s16(vpaddlq_s8(x0i), vpaddlq_s8(x1i))); - // const int32x4_t p1 = vpaddlq_s16(vaddq_s16(vpaddlq_s8(x2i), vpaddlq_s8(x3i))); - - // there's no direct equivalent to _mm_sign_epi8, unfortunately - x0i = vmulq_s8(x0i, y0); - x1i = vmulq_s8(x1i, y1); - x2i = vmulq_s8(x2i, y2); - x3i = vmulq_s8(x3i, y3); - - // overall 18.5% faster than with vector sums on a cortex-A72 - sumf0 += GGML_FP16_TO_FP32(y[0].d) * (float) vaddlvq_s16(vaddq_s16(vpaddlq_s8(x0i), vpaddlq_s8(x1i))); - sumf1 += GGML_FP16_TO_FP32(y[1].d) * (float) vaddlvq_s16(vaddq_s16(vpaddlq_s8(x2i), vpaddlq_s8(x3i))); - - // const int32x4_t p0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), x0i, y0), x1i, y1); - // const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), x2i, y2), x3i, y3); - - // sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p0), GGML_FP16_TO_FP32(y[0].d)); - // sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p1), GGML_FP16_TO_FP32(y[1].d)); - - y += 2; - x += 1; - } - - // *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); - *s = sumf0 + sumf1; -#else - float sumf = 0.0f; - - for (int i = 0; i < nb; ++i) { - int sum = 0; - for (int j = 0; j < 8; ++j) { - const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].q[j]); - for (int k = 0; k < 4; ++k) { - sum += xj[k] * (int16_t) y->qs[4*j + k]; - } - } - - sumf += GGML_FP16_TO_FP32(y->d) * sum; - y += 1; - sum = 0; - - for (int j = 0; j < 4; ++j) { - const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].q[8 + j]); - for (int k = 0; k < 4; ++k) { - sum += xj[k] * (int16_t) y->qs[4*j + k]; - } - } - - for (size_t j = 0; j < 12; ++j) { - uint16_t xj = x[i].q[j]; - xj = (xj * 3) >> 8; - sum += ((int16_t) xj - 1) * (int16_t) y->qs[16 + j]; - } - - { - const int8_t * xj = (const int8_t *) (q1_3_grid + x[i].qs[0]); - for (int k = 0; k < 4; ++k) { - sum += (int16_t) xj[k] * (int16_t) y->qs[28 + k]; - } - } - - sumf += GGML_FP16_TO_FP32(y->d) * sum; - y += 1; - } - - *s = sumf; -#endif -} - void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -15964,8 +15486,7 @@ bool 
ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); } break; - case GGML_TYPE_Q1_3: - case GGML_TYPE_Q2_2: + case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 238cfd3fb7e42..df9c4b24ae74f 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -12,8 +12,6 @@ extern "C" { #endif // Quantization -void quantize_row_q1_3_ref(const float * GGML_RESTRICT x, block_q1_3 * GGML_RESTRICT y, int64_t k); -void quantize_row_q2_2_ref(const float * GGML_RESTRICT x, block_q2_2 * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); @@ -37,8 +35,6 @@ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGM void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); -void quantize_row_q1_3(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q2_2(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -63,8 +59,6 @@ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dequantization -void dequantize_row_q1_3(const block_q1_3 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q2_2(const block_q2_2 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -93,8 +87,6 @@ void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); // Dot product -void ggml_vec_dot_q1_3_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q2_2_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -139,8 +131,6 @@ size_t quantize_q3_K(const float * 
GGML_RESTRICT src, void * GGML_RESTRICT dst, size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q1_3(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q2_2(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5bbe0e4a89b4c..b56ebcb7cdb58 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -856,30 +856,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, - [GGML_TYPE_Q2_2] = { - .type_name = "q2_2", - .blck_size = QK2_2, - .type_size = sizeof(block_q2_2), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q2_2, - .from_float = quantize_row_q2_2, - .from_float_ref = (ggml_from_float_t) quantize_row_q2_2_ref, - .vec_dot = ggml_vec_dot_q2_2_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - }, - [GGML_TYPE_Q1_3] = { - .type_name = "q1_3", - .blck_size = QK1_3, - .type_size = sizeof(block_q1_3), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q1_3, - .from_float = quantize_row_q1_3, - .from_float_ref = (ggml_from_float_t) quantize_row_q1_3_ref, - .vec_dot = ggml_vec_dot_q1_3_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", .blck_size = QK_K, @@ -13936,8 +13912,6 @@ static void ggml_compute_forward_clamp( } break; case GGML_TYPE_F16: case GGML_TYPE_BF16: - case GGML_TYPE_Q1_3: - case GGML_TYPE_Q2_2: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -20638,8 +20612,6 @@ size_t ggml_quantize_chunk( size_t result = 0; switch (type) { - case GGML_TYPE_Q1_3: result = quantize_q1_3(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q2_2: result = quantize_q2_2(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 32e73e56aa424..57aa53b410f23 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1150,8 +1150,6 @@ class GGMLQuantizationType(IntEnum): Q4_0_8_8 = 33 TQ1_0 = 34 TQ2_0 = 35 - Q1_3 = 36 - Q2_2 = 37 # TODO: add GGMLFileType from ggml_ftype in ggml.h @@ -1193,8 +1191,11 @@ class 
LlamaFileType(IntEnum): MOSTLY_IQ4_XS = 30 # except 1d tensors MOSTLY_IQ1_M = 31 # except 1d tensors MOSTLY_BF16 = 32 # except 1d tensors - MOSTLY_Q2_2 = 33 # except 1d tensors - MOSTLY_Q1_3 = 34 # except 1d tensors + MOSTLY_Q4_0_4_4 = 33 # except 1d tensors + MOSTLY_Q4_0_4_8 = 34 # except 1d tensors + MOSTLY_Q4_0_8_8 = 35 # except 1d tensors + MOSTLY_TQ1_0 = 36 # except 1d tensors + MOSTLY_TQ2_0 = 37 # except 1d tensors GUESSED = 1024 # not specified in the model file @@ -1268,8 +1269,11 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.Q2_2: (32, 8), - GGMLQuantizationType.Q1_3: (64, 12 + 1), + GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16), + GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16), + GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16), + GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), + GGMLQuantizationType.TQ2_0: (256, 2 + 64), } diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 8f7fd0232a7ce..16e0a9aaa8a8b 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -121,55 +121,3 @@ def quantize_q8_0(data: np.ndarray): return __quantize_q8_0_lazy(data) else: return __quantize_q8_0_array(data) - - -__q1_3_block_size, __q1_3_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3] - - -def can_quantize_to_q1_3(n: np.ndarray) -> bool: - return n.shape[-1] % __q1_3_block_size == 0 - - -def __quantize_q1_3_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: - return (*s[:-1], s[-1] // __q1_3_block_size * __q1_3_type_size) - - -def __quantize_q1_3_rows(n: np.ndarray) -> np.ndarray: - shape = n.shape - assert shape[-1] % __q1_3_block_size == 0 - - n_blocks = n.size // __q1_3_block_size - - blocks = n.reshape((n_blocks, __q1_3_block_size)).astype(np.float32, copy=False) - - # assuming the weights are pre-scaled - blocks = (np.sign(blocks).astype(np.int8) + 1).view(np.uint8) - q48, rest = np.hsplit(blocks, (48,)) - q12, q4 = np.hsplit(rest, (12,)) - - pow3 = np.array([1, 3, 9, 27]) - q48 = q48.reshape((n_blocks, 12, 4)) - q48 = np.sum(q48 * pow3.reshape((1, 1, 4)), axis=2, keepdims=True).reshape((n_blocks, 12)) - q4 = np.sum(q4 * pow3.reshape((1, 4)), axis=1, keepdims=True) - q48 = q48 + (q12 * 81) - q = np.concatenate([q48, q4], axis=1) - q = (((q.astype(np.uint16) * 256) + (243 - 1)) // 243).astype(np.uint8) - - return q.reshape(__quantize_q1_3_shape_change(shape)) - - -def __quantize_q1_3_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows(__quantize_q1_3_rows, arr=n, otype=np.uint8, oshape=__quantize_q1_3_shape_change(n.shape)) - - -__quantize_q1_3_lazy = LazyNumpyTensor._wrap_fn( - __quantize_q1_3_array, - meta_noop=(np.uint8, __quantize_q1_3_shape_change), -) - - -def quantize_q1_3(data: np.ndarray): - if type(data) is LazyNumpyTensor: - return __quantize_q1_3_lazy(data) - else: - return __quantize_q1_3_array(data) diff --git a/include/llama.h b/include/llama.h index 7dcc260e8faba..be039e45fb62e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -168,8 +168,6 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q1_3 = 38, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_2 = 39, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7ab2b47cd6ee5..bd52975db8543 100644 --- 
a/src/llama.cpp +++ b/src/llama.cpp @@ -4451,8 +4451,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; - case LLAMA_FTYPE_MOSTLY_Q1_3: return "Q1_3 - 1.625 bpw for BitNet b1.58"; - case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2 - 2.000 bpw for BitNet b1.58"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; @@ -7347,23 +7345,23 @@ static bool llm_load_tensors( layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}); layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); - layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}); + layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); - layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}); + layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); - layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}); + layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}); + layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); - layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}); + layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}); + layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}); + layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_T5: @@ -13028,7 +13026,9 @@ struct llm_build_context { { // compute Q and K and RoPE them struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); + if (model.layers[il].wq_scale) { + Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); + } cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = 
ggml_add(ctx0, Qcur, model.layers[il].bq); @@ -13037,7 +13037,9 @@ struct llm_build_context { // B1.K struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); + if (model.layers[il].wk_scale) { + Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); + } cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); @@ -13046,7 +13048,9 @@ struct llm_build_context { // B1.V struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); + if (model.layers[il].wv_scale) { + Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); + } cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -13077,7 +13081,9 @@ struct llm_build_context { cb(cur, "attn_sub_norm", il); cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); + if (model.layers[il].wo_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); + } if (model.layers[il].bo) { cur = ggml_add(ctx0, cur, model.layers[il].bo); } @@ -13114,7 +13120,9 @@ struct llm_build_context { cb(cur, "ffn_sub_norm", il); cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); + if (model.layers[il].ffn_down_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); + } cb(cur, "ffn_down", il); cur = ggml_add(ctx0, cur, ffn_inp); @@ -15631,8 +15639,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llama_ftype ftype = params->ftype; switch (params->ftype) { - case LLAMA_FTYPE_MOSTLY_Q1_3: default_type = GGML_TYPE_Q1_3; break; - case LLAMA_FTYPE_MOSTLY_Q2_2: default_type = GGML_TYPE_Q2_2; break; case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index a6e508d017d16..ccf5721a3ab83 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -15,13 +15,13 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.015625f; // TODO: change to 0.01f +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; -constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.5f; // TODO: change to 0.15f +constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f; static const char* RESULT_STR[] = {"ok", "FAILED"}; @@ -146,8 +146,6 @@ int main(int argc, char * argv[]) { if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = - type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : - type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_TQ2_0 ? 
MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : @@ -172,7 +170,7 @@ int main(int argc, char * argv[]) { const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT - : type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3 || type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 + : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 ? MAX_DOT_PRODUCT_ERROR_TERNARY : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); From f034aa1bb1eddb528883135e9da0b66ad5afbb05 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 3 Aug 2024 16:22:04 -0400 Subject: [PATCH 21/28] ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency --- ggml/src/ggml-common.h | 6 +-- ggml/src/ggml-quants.c | 86 +++++++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index c65614696fc54..050161393456e 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -233,15 +233,15 @@ static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong // 1.6875 bpw typedef struct { - uint8_t q[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256) - uint8_t qs[QK_K/64]; // 4 elements per byte + uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256) + uint8_t qh[QK_K/64]; // 4 elements per byte ggml_half d; } block_tq1_0; static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding"); // 2.0625 bpw typedef struct { - uint8_t q[QK_K/4]; // 2 bits per element + uint8_t qs[QK_K/4]; // 2 bits per element ggml_half d; } block_tq2_0; static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding"); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index a2fd0563c8cfe..0caf619f2c6a8 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3326,7 +3326,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, y[i].d = GGML_FP32_TO_FP16(d); // 5 elements per byte, along 32 bytes - for (size_t j = 0; j < sizeof(y->q) - sizeof(y->q) % 32; j += 32) { + for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) { for (size_t m = 0; m < 32; ++m) { uint8_t q = 0; for (size_t n = 0; n < 5; ++n) { @@ -3336,12 +3336,12 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, } // ceiling division (243 == pow(3, 5)) q = ((uint16_t)q * 256 + (243 - 1)) / 243; - y[i].q[j + m] = q; + y[i].qs[j + m] = q; } x += 5*32; } // along 16 bytes - for (size_t j = sizeof(y->q) - sizeof(y->q) % 32; j < sizeof(y->q); j += 16) { + for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) { for (size_t m = 0; m < 16; ++m) { uint8_t q = 0; for (size_t n = 0; n < 5; ++n) { @@ -3351,16 +3351,16 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, } // ceiling division (243 == pow(3, 5)) q = ((uint16_t)q * 256 + (243 - 1)) / 243; - y[i].q[j + m] = q; + y[i].qs[j + m] = q; } x += 5*16; } // 4 elements per byte - for (size_t j = 0; j < sizeof(y->qs); ++j) { + for (size_t j = 0; j < sizeof(y->qh); ++j) { uint8_t q = 0; for (size_t m = 0; m < 4; ++m) { // -1, 0, 1 -> 0, 1, 2 - int xi = nearest_int(x[j + m*sizeof(y->qs)] * id) + 1; 
+ int xi = nearest_int(x[j + m*sizeof(y->qh)] * id) + 1; q *= 3; q += xi; } @@ -3368,9 +3368,9 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, q *= 3; // ceiling division (243 == pow(3, 5)) q = ((uint16_t)q * 256 + (243 - 1)) / 243; - y[i].qs[j] = q; + y[i].qh[j] = q; } - x += 4*sizeof(y->qs); + x += 4*sizeof(y->qh); } } @@ -3392,7 +3392,7 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, y[i].d = GGML_FP32_TO_FP16(d); // TODO: should it be along 64 bytes instead for AVX512? - for (size_t j = 0; j < sizeof(y->q); j += 32) { + for (size_t j = 0; j < sizeof(y->qs); j += 32) { for (size_t m = 0; m < 32; ++m) { uint8_t q = 0; for (size_t n = 0; n < 4; ++n) { @@ -3400,7 +3400,7 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int xi = nearest_int(x[m + n*32] * id) + 1; q += (xi & 3) << (2*n); } - y[i].q[j + m] = q; + y[i].qs[j + m] = q; } x += 4*32; } @@ -3444,19 +3444,19 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in const float d = GGML_FP16_TO_FP32(x[i].d); - for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) { + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { for (size_t n = 0; n < 5; ++n) { for (size_t m = 0; m < 32; ++m) { - uint8_t q = x[i].q[j + m] * pow3[n]; + uint8_t q = x[i].qs[j + m] * pow3[n]; int16_t xi = ((uint16_t) q * 3) >> 8; *y++ = (float) (xi - 1) * d; } } } - for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) { + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { for (size_t n = 0; n < 5; ++n) { for (size_t m = 0; m < 16; ++m) { - uint8_t q = x[i].q[j + m] * pow3[n]; + uint8_t q = x[i].qs[j + m] * pow3[n]; int16_t xi = ((uint16_t) q * 3) >> 8; *y++ = (float) (xi - 1) * d; } @@ -3464,8 +3464,8 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in } for (size_t n = 0; n < 4; ++n) { - for (size_t j = 0; j < sizeof(x->qs); ++j) { - uint8_t q = x[i].qs[j] * pow3[n]; + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[n]; int16_t xi = ((uint16_t) q * 3) >> 8; *y++ = (float) (xi - 1) * d; } @@ -3481,10 +3481,10 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in const float d = GGML_FP16_TO_FP32(x[i].d); - for (size_t j = 0; j < sizeof(x->q); j += 32) { + for (size_t j = 0; j < sizeof(x->qs); j += 32) { for (size_t l = 0; l < 4; ++l) { for (size_t m = 0; m < 32; ++m) { - int8_t q = (x[i].q[j + m] >> (l*2)) & 3; + int8_t q = (x[i].qs[j + m] >> (l*2)) & 3; *y++ = (float) (q - 1) * d; } } @@ -5681,8 +5681,8 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * // first 32 bytes of 5 elements { - uint8x16_t qx0 = vld1q_u8(x[i].q + 0); - uint8x16_t qx1 = vld1q_u8(x[i].q + 16); + uint8x16_t qx0 = vld1q_u8(x[i].qs + 0); + uint8x16_t qx1 = vld1q_u8(x[i].qs + 16); uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); @@ -5739,14 +5739,14 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * // last 16 bytes of 5-element, along with the 4 bytes of 4 elements { - uint8x16_t qx0 = vld1q_u8(x[i].q + 32); + uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); - 
uint32_t qs; - memcpy(&qs, x[i].qs, sizeof(qs)); // potentially unaligned - uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qs)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); qx5 = vmulq_u8(qx5, shift); // multiply by 3 and keep the 2 bits above 8 bits @@ -5802,7 +5802,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * // first 32 bytes of 5 elements { - __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q)); + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs)); // 8-bit multiplies with shifts, masks and adds __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 @@ -5848,10 +5848,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * // last 16 bytes of 5-element, along with the 4 bytes of 4 elements { - __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].q + 32)); - uint32_t qs; - memcpy(&qs, x[i].qs, sizeof(qs)); // potentially unaligned - __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qs)); + __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh)); __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 @@ -5911,19 +5911,19 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * for (int i = 0; i < nb; ++i) { int sum = 0; - for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) { + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { for (size_t l = 0; l < 5; ++l) { for (size_t m = 0; m < 32; ++m) { - uint8_t q = x[i].q[j + m] * pow3[l]; + uint8_t q = x[i].qs[j + m] * pow3[l]; uint16_t xi = ((uint16_t) q * 3) >> 8; sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; } } } - for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) { + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { for (size_t l = 0; l < 5; ++l) { for (size_t m = 0; m < 16; ++m) { - uint8_t q = x[i].q[j + m] * pow3[l]; + uint8_t q = x[i].qs[j + m] * pow3[l]; uint16_t xi = ((uint16_t) q * 3) >> 8; sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; } @@ -5931,10 +5931,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * } for (size_t l = 0; l < 4; ++l) { - for (size_t j = 0; j < sizeof(x->qs); ++j) { - uint8_t q = x[i].qs[j] * pow3[l]; + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[sizeof(x->q)*5 + l*sizeof(x->qs) + j]; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; } } @@ -5966,9 +5966,9 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * int16x8_t sumi0 = vdupq_n_s16(0); int16x8_t sumi1 = vdupq_n_s16(0); - for (size_t j = 0; j < sizeof(x->q); j += 32) { - uint8x16_t qx0 = vld1q_u8(x[i].q + j); - uint8x16_t qx1 = vld1q_u8(x[i].q + j + 16); + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + uint8x16_t qx0 = vld1q_u8(x[i].qs + j); + uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16); uint8x16_t qx2 = 
vshrq_n_u8(qx0, 2); uint8x16_t qx3 = vshrq_n_u8(qx1, 2); uint8x16_t qx4 = vshrq_n_u8(qx0, 4); @@ -6033,8 +6033,8 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * __m256i sumi0 = _mm256_setzero_si256(); __m256i sumi1 = _mm256_setzero_si256(); - for (size_t j = 0; j < sizeof(x->q); j += 32) { - __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q + j)); + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j)); __m256i qx1 = _mm256_srli_epi16(qx0, 2); __m256i qx2 = _mm256_srli_epi16(qx0, 4); __m256i qx3 = _mm256_srli_epi16(qx0, 6); @@ -6077,10 +6077,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * for (int i = 0; i < nb; ++i) { int32_t sumi = 0; - for (size_t j = 0; j < sizeof(x->q); j += 32) { + for (size_t j = 0; j < sizeof(x->qs); j += 32) { for (size_t l = 0; l < 4; ++l) { for (size_t k = 0; k < 32; ++k) { - sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].q[j + k] >> (l*2)) & 3) - 1); + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); } } } From 96b3d411e0469c478100ec72ae5fb758544dc3ba Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 7 Aug 2024 15:04:13 -0400 Subject: [PATCH 22/28] ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot Not yet tested on harware which supports it, might not work or might not even compile. But also it might. It should make the performance better on recent ARM CPUs. * ggml-quants : remove comment about possible format change of TQ2_0 Making it slightly more convenient for AVX512 but less convenient for everything else is not worth the trouble. --- ggml/src/ggml-quants.c | 63 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 0caf619f2c6a8..c39c1d30eea94 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3391,7 +3391,6 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, y[i].d = GGML_FP32_TO_FP16(d); - // TODO: should it be along 64 bytes instead for AVX512? 
for (size_t j = 0; j < sizeof(y->qs); j += 32) { for (size_t m = 0; m < 32; ++m) { uint8_t q = 0; @@ -5957,7 +5956,67 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * const int nb = n / QK_K; -#if defined __ARM_NEON +#if defined __ARM_NEON && defined __ARM_FEATURE_DOTPROD + float sumf = 0.0f; + + const uint8x16_t m3 = vdupq_n_u8(3); + + for (int i = 0; i < nb; ++i) { + int32x4_t sumi0 = vdupq_n_s32(0); + int32x4_t sumi1 = vdupq_n_s32(0); + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + uint8x16_t qx0 = vld1q_u8(x[i].qs + j); + uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16); + uint8x16_t qx2 = vshrq_n_u8(qx0, 2); + uint8x16_t qx3 = vshrq_n_u8(qx1, 2); + uint8x16_t qx4 = vshrq_n_u8(qx0, 4); + uint8x16_t qx5 = vshrq_n_u8(qx1, 4); + uint8x16_t qx6 = vshrq_n_u8(qx0, 6); + uint8x16_t qx7 = vshrq_n_u8(qx1, 6); + + int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); + + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + sumi0 = vdotq_s32(sumi0, sqx6, qy6); + sumi1 = vdotq_s32(sumi1, sqx7, qy7); + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); + + sumi0 = vaddq_s32(sumi0, sumi1); + sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d * (float) vaddvq_s32(sumi0); + } + + *s = sumf; + +#elif defined __ARM_NEON float sumf = 0.0f; const uint8x16_t m3 = vdupq_n_u8(3); From 3a0bf17d5737508fc28f8c1ad1efb206160a7770 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 12 Aug 2024 00:06:48 -0400 Subject: [PATCH 23/28] gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0 * ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0 This does not change anything for ternary models, since their values should never end up being in halfway cases anyway. 
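For reference, the TQ1_0 packing that the Numpy (de)quantization below implements stores 5 ternary values per byte as base-3 digits, scaled into a byte-sized fixed-point value so that each digit can be read back with a multiply and a shift. A minimal round-trip sketch (pure Python; pack5/unpack5 are illustrative helpers, not functions from the patch, and the per-block scale and the 32-element interleaving of the real block layout are left out):

    def pack5(ternary):
        # five values in {-1, 0, 1}; the first one becomes the most significant base-3 digit
        q = 0
        for v in ternary:
            q = q * 3 + (v + 1)
        # ceiling division scales [0, 242] to [0, 255] so that packed/256 ~= q/243
        return (q * 256 + 242) // 243

    def unpack5(packed):
        out = []
        for n in range(5):
            # multiplying by 3**n (mod 256) moves digit n to the top of the fixed-point value,
            # and (q * 3) >> 8 reads it back out as 0, 1 or 2
            q = (packed * 3 ** n) % 256
            out.append(((q * 3) >> 8) - 1)
        return out

    vals = [-1, 0, 1, 1, -1]
    assert unpack5(pack5(vals)) == vals

The ceiling division matters: truncating 81*256/243 to 85, for instance, would decode the leading digit of the pattern 1,0,0,0,0 as 0 instead of 1, which is why the reference code rounds up.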
--- ggml/src/ggml-quants.c | 8 ++-- gguf-py/gguf/quants.py | 81 ++++++++++++++++++++++++++++++++++++ gguf-py/tests/test_quants.py | 1 + 3 files changed, 86 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3993e27baeac2..4fe89de8c3b81 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3330,7 +3330,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, for (size_t m = 0; m < 32; ++m) { uint8_t q = 0; for (size_t n = 0; n < 5; ++n) { - int xi = nearest_int(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2 + int xi = lroundf(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2 q *= 3; q += xi; } @@ -3345,7 +3345,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, for (size_t m = 0; m < 16; ++m) { uint8_t q = 0; for (size_t n = 0; n < 5; ++n) { - int xi = nearest_int(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2 + int xi = lroundf(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2 q *= 3; q += xi; } @@ -3360,7 +3360,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, uint8_t q = 0; for (size_t m = 0; m < 4; ++m) { // -1, 0, 1 -> 0, 1, 2 - int xi = nearest_int(x[j + m*sizeof(y->qh)] * id) + 1; + int xi = lroundf(x[j + m*sizeof(y->qh)] * id) + 1; q *= 3; q += xi; } @@ -3396,7 +3396,7 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, uint8_t q = 0; for (size_t n = 0; n < 4; ++n) { // -1, 0, 1 -> 0, 1, 2 - int xi = nearest_int(x[m + n*32] * id) + 1; + int xi = lroundf(x[m + n*32] * id) + 1; q += (xi & 3) << (2*n); } y[i].qs[j + m] = q; diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index ff589b85245e5..3c8ba82e19d3d 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -574,6 +574,87 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: return (d * q).reshape((n_blocks, QK_K)) +class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) + + qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):] + qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1)) + qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1)) + qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1)) + qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1)) + qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = np.sum(qh, axis=-2).reshape((n_blocks, -1)) + qs = np.concatenate([qs0, qs1, qh], axis=-1) + qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243 + + qs = qs.astype(np.uint8) + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([qs, d], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5]) + qh, d = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + + qs0, qs1 = qs[..., :32], qs[..., 32:] + qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1)) + qs0 = qs0.reshape((n_blocks, -1)) + qs1 = qs1.reshape((n_blocks, 
-1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1)) + qs1 = qs1.reshape((n_blocks, -1)) + qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = qh.reshape((n_blocks, -1)) + qs = np.concatenate([qs0, qs1, qh], axis=-1) + qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1) + + return (d * qs.astype(np.float32)) + + +class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) + + qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :] + qs = qs.reshape((n_blocks, -1)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([qs, d], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, d = np.hsplit(blocks, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1) + + return (d * qs.astype(np.float32)) + + class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): ksigns: bytes = ( b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f" diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py index 8b7a85c2c36d7..762067814224e 100755 --- a/gguf-py/tests/test_quants.py +++ b/gguf-py/tests/test_quants.py @@ -66,6 +66,7 @@ def __init__(self, libggml: Path): for t in ( "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", + "tq1_0", "tq2_0", "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m", "iq4_nl", "iq4_xs", ): From 895004f3f88fa6957ebc4c7f7a5b811da5db7d86 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 13 Aug 2024 17:17:43 -0400 Subject: [PATCH 24/28] convert : allow direct conversion to TQ1_0 and TQ2_0 The token embeddings and output tensors are kept in F16 to allow quantizing them to Q4_K and Q6_K with llama-quantize. * llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0 Q4_0 is not completely symmetric (so not lossless for ternary models), but it should be good enough. 
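TQ2_0, the other type the conversion path now targets, trades some size for simplicity: each ternary value occupies a fixed 2-bit field, so packing and unpacking are plain shifts and masks. A minimal sketch (pure Python; pack4/unpack4 are illustrative helpers, not functions from the patch, with the per-block scale and the 32-element interleaving of the real layout omitted):

    def pack4(ternary):
        # four values in {-1, 0, 1}, stored as 0/1/2; the first value goes in the low bits
        q = 0
        for n, v in enumerate(ternary):
            q |= (v + 1) << (2 * n)
        return q

    def unpack4(packed):
        return [((packed >> (2 * n)) & 3) - 1 for n in range(4)]

    vals = [1, -1, 0, 1]
    assert unpack4(pack4(vals)) == vals

At 2.0625 bpw it is larger than TQ1_0's 1.6875 bpw, but the shift-and-mask decoding maps directly onto SIMD, as seen in the comparatively simpler TQ2_0 vec_dot kernels above.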
--- convert_hf_to_gguf.py | 24 ++++++++++++++++++++++-- src/llama.cpp | 2 ++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b3b585e98a305..ece4ad068a300 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -301,6 +301,20 @@ def prepare_tensors(self): ): data_qtype = gguf.GGMLQuantizationType.F32 + if data_qtype is False and any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + ) + ): + if self.ftype in ( + gguf.LlamaFileType.MOSTLY_TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0, + ): + # TODO: use Q4_K and Q6_K + data_qtype = gguf.GGMLQuantizationType.F16 + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) if isinstance(data_qtype, bool): if self.ftype == gguf.LlamaFileType.ALL_F32: @@ -311,6 +325,10 @@ def prepare_tensors(self): data_qtype = gguf.GGMLQuantizationType.BF16 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + data_qtype = gguf.GGMLQuantizationType.TQ1_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + data_qtype = gguf.GGMLQuantizationType.TQ2_0 else: raise ValueError(f"Unknown file type: {self.ftype.name}") @@ -3814,8 +3832,8 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", @@ -3902,6 +3920,8 @@ def main() -> None: "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, + "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, "auto": gguf.LlamaFileType.GUESSED, } diff --git a/src/llama.cpp b/src/llama.cpp index 2ad9755997b3e..5e54c3af4b7f8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15717,6 +15717,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } if (convert_incompatible_tensor) { switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: From 69f772682e5b7cdd9fa48f6a0745aa9794050e43 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 13 Aug 2024 17:21:19 -0400 Subject: [PATCH 25/28] ggml-quants : allow using ARM dot product instructions for TQ1_0 --- ggml/src/ggml-quants.c | 109 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 4fe89de8c3b81..0574bf4451614 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5667,7 +5667,114 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const 
int nb = n / QK_K; -#if defined __ARM_NEON +#if defined __ARM_NEON && defined __ARM_FEATURE_DOTPROD + float sumf = 0.0f; + + uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; + + const uint8x16_t shift = vld1q_u8(k_shift); + + for (int i = 0; i < nb; ++i) { + int32x4_t sumi0 = vdupq_n_s32(0); + int32x4_t sumi1 = vdupq_n_s32(0); + + // first 32 bytes of 5 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].qs + 0); + uint8x16_t qx1 = vld1q_u8(x[i].qs + 16); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9)); + uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27)); + uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81)); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6)); + int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6)); + int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + 112); + const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); + const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); + + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + sumi0 = vdotq_s32(sumi0, sqx6, qy6); + sumi1 = vdotq_s32(sumi1, sqx7, qy7); + sumi0 = vdotq_s32(sumi0, sqx8, qy8); + sumi1 = vdotq_s32(sumi1, sqx9, qy9); + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); + uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); + qx5 = vmulq_u8(qx5, shift); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = 
vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 160); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 176); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 192); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 208); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 224); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); + + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); + + sumi0 = vaddq_s32(sumi0, sumi1); + sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d * (float) vaddvq_s32(sumi0); + } + + *s = sumf; + +#elif defined __ARM_NEON float sumf = 0.0f; uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; From 35cc5567c8cce16443e97789ce428ce275922f34 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 13 Aug 2024 18:00:06 -0400 Subject: [PATCH 26/28] ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support --- ggml/src/ggml-quants.c | 204 ++++++++++------------------------------- 1 file changed, 47 insertions(+), 157 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 0574bf4451614..15e5297227854 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5667,7 +5667,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const int nb = n / QK_K; -#if defined __ARM_NEON && defined __ARM_FEATURE_DOTPROD +#if defined(__ARM_NEON) float sumf = 0.0f; uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; @@ -5675,8 +5675,13 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const uint8x16_t shift = vld1q_u8(k_shift); for (int i = 0; i < nb; ++i) { +#if defined(__ARM_FEATURE_DOTPROD) int32x4_t sumi0 = vdupq_n_s32(0); int32x4_t sumi1 = vdupq_n_s32(0); +#else + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); +#endif // first 32 bytes of 5 elements { @@ -5714,6 +5719,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); +#if defined(__ARM_FEATURE_DOTPROD) sumi0 = vdotq_s32(sumi0, sqx0, qy0); sumi1 = vdotq_s32(sumi1, sqx1, qy1); sumi0 = vdotq_s32(sumi0, sqx2, qy2); @@ -5724,103 +5730,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * sumi1 = vdotq_s32(sumi1, sqx7, qy7); sumi0 = vdotq_s32(sumi0, sqx8, qy8); sumi1 = vdotq_s32(sumi1, sqx9, qy9); - } - - // last 16 bytes of 5-element, along with the 4 bytes of 4 elements - { - uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); - uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); - uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); - uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); - uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned - uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); - qx5 = vmulq_u8(qx5, shift); - - // multiply by 3 and keep the 2 bits above 
8 bits
-            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
-            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
-            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
-            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
-            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
-            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
-
-            const int8x16_t qy0 = vld1q_s8(y[i].qs + 160);
-            const int8x16_t qy1 = vld1q_s8(y[i].qs + 176);
-            const int8x16_t qy2 = vld1q_s8(y[i].qs + 192);
-            const int8x16_t qy3 = vld1q_s8(y[i].qs + 208);
-            const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
-            const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);
-
-            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
-            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
-            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
-            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
-            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
-            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
-        }
-
-        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
-        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
-
-        sumi0 = vaddq_s32(sumi0, sumi1);
-        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        sumf += d * (float) vaddvq_s32(sumi0);
-    }
-
-    *s = sumf;
-
-#elif defined __ARM_NEON
-    float sumf = 0.0f;
-
-    uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
-
-    const uint8x16_t shift = vld1q_u8(k_shift);
-
-    for (int i = 0; i < nb; ++i) {
-        int16x8_t sumi0 = vdupq_n_s16(0);
-        int16x8_t sumi1 = vdupq_n_s16(0);
-
-        // first 32 bytes of 5 elements
-        {
-            uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
-            uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
-            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
-            uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
-            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
-            uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9));
-            uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27));
-            uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27));
-            uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81));
-            uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81));
-
-            // multiply by 3 and keep the 2 bits above 8 bits
-            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
-            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
-            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
-            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
-            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
-            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
-            int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6));
-            int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6));
-            int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6));
-            int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6));
-
-            const int8x16_t qy0 = vld1q_s8(y[i].qs + 0);
-            const int8x16_t qy1 = vld1q_s8(y[i].qs + 16);
-            const int8x16_t qy2 = vld1q_s8(y[i].qs + 32);
-            const int8x16_t qy3 = vld1q_s8(y[i].qs + 48);
-            const int8x16_t qy4 = vld1q_s8(y[i].qs + 64);
-            const int8x16_t qy5 = vld1q_s8(y[i].qs + 80);
-            const int8x16_t qy6 = vld1q_s8(y[i].qs + 96);
-            const int8x16_t qy7 = vld1q_s8(y[i].qs + 112);
-            const int8x16_t qy8 = vld1q_s8(y[i].qs + 128);
-            const int8x16_t qy9 = vld1q_s8(y[i].qs + 144);
-
+#else
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
@@ -5841,6 +5751,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9));
+#endif
        }

        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
@@ -5870,6 +5781,14 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
            const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
            const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);

+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+#else
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
@@ -5882,22 +5801,30 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+#endif
        }

        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);

+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumi0 = vaddq_s32(sumi0, sumi1);
+        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
+
+        sumf += d * (float) vaddvq_s32(sumi0);
+#else
        sumi0 = vaddq_s16(sumi0, sumi1);
        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));

-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
        sumf += d * (float) vaddlvq_s16(sumi0);
+#endif
    }

    *s = sumf;

-#elif defined __AVX2__
+#elif defined(__AVX2__)
    __m256 sumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
@@ -6063,14 +5990,19 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
    const int nb = n / QK_K;

-#if defined __ARM_NEON && defined __ARM_FEATURE_DOTPROD
+#if defined(__ARM_NEON)
    float sumf = 0.0f;

    const uint8x16_t m3 = vdupq_n_u8(3);

    for (int i = 0; i < nb; ++i) {
+#if defined(__ARM_FEATURE_DOTPROD)
        int32x4_t sumi0 = vdupq_n_s32(0);
        int32x4_t sumi1 = vdupq_n_s32(0);
+#else
+        int16x8_t sumi0 = vdupq_n_s16(0);
+        int16x8_t sumi1 = vdupq_n_s16(0);
+#endif

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
@@ -6100,6 +6032,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
            const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96);
            const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);

+#if defined(__ARM_FEATURE_DOTPROD)
            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
@@ -6108,58 +6041,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
-        }
-
-        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
-        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
-
-        sumi0 = vaddq_s32(sumi0, sumi1);
-        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        sumf += d * (float) vaddvq_s32(sumi0);
-    }
-
-    *s = sumf;
-
-#elif defined __ARM_NEON
-    float sumf = 0.0f;
-
-    const uint8x16_t m3 = vdupq_n_u8(3);
-
-    for (int i = 0; i < nb; ++i) {
-        int16x8_t sumi0 = vdupq_n_s16(0);
-        int16x8_t sumi1 = vdupq_n_s16(0);
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
-            uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
-            uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
-            uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
-            uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
-            uint8x16_t qx5 = vshrq_n_u8(qx1, 4);
-            uint8x16_t qx6 = vshrq_n_u8(qx0, 6);
-            uint8x16_t qx7 = vshrq_n_u8(qx1, 6);
-
-            int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3));
-            int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3));
-            int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3));
-            int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3));
-            int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3));
-            int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3));
-            int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3));
-            int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3));
-
-            const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0);
-            const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16);
-            const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32);
-            const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48);
-            const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64);
-            const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80);
-            const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96);
-            const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);
-
+#else
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
@@ -6176,22 +6058,30 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
+#endif
        }

        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);

+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumi0 = vaddq_s32(sumi0, sumi1);
+        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
+
+        sumf += d * (float) vaddvq_s32(sumi0);
+#else
        sumi0 = vaddq_s16(sumi0, sumi1);
        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));

-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
        sumf += d * (float) vaddlvq_s16(sumi0);
+#endif
    }

    *s = sumf;

-#elif defined __AVX2__
+#elif defined(__AVX2__)
    __m256 sumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
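
For reference, the 5-trits-per-byte packing that the TQ1_0 kernels above decode can be written out in scalar form. The sketch below is illustrative only and is not part of the patch; it assumes the most-significant-digit-first base-3 packing with the 256/243 fixed-point scaling used by the quantizer, and the helper names are made up:

    #include <stdint.h>

    // Pack 5 ternary digits (each 0, 1 or 2) into one byte, scaled by 256/243
    // so that any digit can later be read back from the top bits of the byte.
    static uint8_t pack5_trits(const uint8_t d[5]) {
        uint16_t v = 0;
        for (int n = 0; n < 5; ++n) {
            v = v*3 + d[n];                      // v in [0, 242]
        }
        return (uint8_t) ((v*256 + 242) / 243);  // ceiling division by 3^5
    }

    // Read back digit n (0..4): multiplying by 3^n moves digit n into the most
    // significant base-3 place (already-read higher digits overflow out of the
    // byte), and (q * 3) >> 8 extracts it. The NEON code above does the same
    // with vmulq_u8 and the halving-add/shift sequence, since
    // ((q + (q >> 1)) >> 1) >> 6 == (3*q) >> 8 for all 8-bit q.
    static uint8_t unpack_trit(uint8_t q, int n) {
        static const uint8_t pow3[5] = {1, 3, 9, 27, 81};
        const uint8_t shifted = (uint8_t) (q * pow3[n]); // wraps modulo 256
        return (uint8_t) (((uint16_t) shifted * 3) >> 8);
    }

The stored digits are the ternary weights offset by +1, which is why both kernels subtract the sum of the activations (the Q8_K bsums) at the end instead of sign-extending each weight.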

From 8d61607656c7624f598e976ed31dc6ada7f7596d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 4 Sep 2024 13:50:08 -0400
Subject: [PATCH 27/28] ggml : remove unused ggml_mul special case

It would otherwise conflict with the more general optimization coming
with Mamba-2.

* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
---
 ggml/src/ggml.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a629918fa00dd..c98ca32bd45bf 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -9921,6 +9921,8 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -10299,6 +10301,8 @@ static void ggml_compute_forward_add1(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -10427,6 +10431,8 @@ static void ggml_compute_forward_acc(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -10562,16 +10568,7 @@ static void ggml_compute_forward_mul_f32(
    GGML_ASSERT( nb0 == sizeof(float));
    GGML_ASSERT(nb00 == sizeof(float));

-    if (ggml_nelements(src1) == 1) {
-        float scale = ((float *) src1->data)[0];
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            if (dst->data != src0->data) {
-                // src0 is same shape as dst => same indices
-                memcpy((char *)dst->data + ir*nb1, (char *)src0->data + ir*nb01, ne0 * sizeof(float));
-            }
-            ggml_vec_scale_f32(ne0, (float *) ((char *) dst->data + ir*nb1), scale);
-        }
-    } else if (nb10 == sizeof(float)) {
+    if (nb10 == sizeof(float)) {
        for (int64_t ir = ith; ir < nr; ir += nth) {
            // src0 and dst are same shape => same indices
            const int64_t i03 = ir/(ne02*ne01);
@@ -13419,6 +13416,8 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -13607,6 +13606,8 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:

From 75b3a09602d36838f69daeb0adab54797f878c02 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 4 Sep 2024 14:01:25 -0400
Subject: [PATCH 28/28] test-backend-ops : add TQ1_0 and TQ2_0 comments for later

Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)

Support for TQ1_0 and TQ2_0 for other backends than CPU will be added in
follow-up pull requests.
---
 tests/test-backend-ops.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index c832bc9569bbf..bd65e8cb36ba7 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2200,6 +2200,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K,
+        // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
@@ -2219,6 +2220,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
        GGML_TYPE_Q5_K, GGML_TYPE_Q6_K,
+        // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
        GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
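
As a companion note for the backends that will add TQ1_0 and TQ2_0 support later, here is a scalar sketch of the per-block dot product the TQ2_0 kernels in this series compute. It is not code from the patch: it assumes the QK_K = 256 block layout from ggml-common.h (64 bytes of 2-bit values, read out plane by plane) and a Q8_K activation row, and the function name is made up for illustration:

    #include <stdint.h>

    // Scalar reference for one TQ2_0 block: 2-bit weights in {0, 1, 2} stand
    // for {-1, 0, +1}, so the +1 bias is removed at the end by subtracting the
    // sum of the activations (what the SIMD kernels take from the bsums).
    static float vec_dot_tq2_0_block(const uint8_t qs[64], const int8_t q8[256],
                                     float d_weight, float d_act) {
        int32_t sumi = 0;
        int32_t ysum = 0;
        const int8_t * y = q8;
        for (int j = 0; j < 64; j += 32) {                   // two groups of 32 bytes
            for (int l = 0; l < 4; ++l) {                    // four 2-bit planes per byte
                for (int m = 0; m < 32; ++m) {
                    const int w = (qs[j + m] >> (l*2)) & 3;  // 0, 1 or 2
                    sumi += w * y[0];
                    ysum += y[0];
                    ++y;
                }
            }
        }
        return d_weight * d_act * (float) (sumi - ysum);
    }

Whichever backend picks these types up only has to reproduce this result; the NEON and AVX2 paths above are wider ways of doing the same multiply-accumulate.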