From 1607a5e5b08f4e55f118af3d7de325949d8f1835 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Fri, 15 Nov 2024 01:28:50 +0100 Subject: [PATCH 01/46] backend cpu: add online flow for aarch64 Q4_0 GEMV/GEMM kernels (#9921) * backend-cpu: add online flow for aarch64 Q4_0 GEMV/GEMM kernels --------- Co-authored-by: Diego Devesa --- Makefile | 4 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-cpu.h | 3 + ggml/src/ggml-cpu/CMakeLists.txt | 5 + ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 144 +++++++++++++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu-aarch64.h | 3 + ggml/src/ggml-cpu/ggml-cpu.c | 23 +++-- ggml/src/ggml-cpu/ggml-cpu.cpp | 106 ++++++++++++++++++-- src/llama.cpp | 2 +- 9 files changed, 271 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index de06cb8b0335e..87fe795aa8432 100644 --- a/Makefile +++ b/Makefile @@ -940,6 +940,10 @@ ggml/src/ggml-cuda/%.o: \ $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< endif # GGML_MUSA +ifndef GGML_NO_CPU_AARCH64 + MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64 +endif + ifdef GGML_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 3e5b16f8629a5..4fb78e59fa72c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -92,6 +92,7 @@ else() endif() option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) +option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 4da62cb2b63f3..7571ef9798364 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -169,6 +169,9 @@ extern "C" { GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); #endif + GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); + GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 4d96f425e3c6e..8b0d60d4ec7b9 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -236,6 +236,11 @@ else() message(STATUS "Unknown architecture") endif() +if (GGML_CPU_AARCH64) + message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") + add_compile_definitions(GGML_USE_CPU_AARCH64) +endif() + target_compile_options(ggml-cpu PRIVATE "$<$:${ARCH_FLAGS}>") target_compile_options(ggml-cpu PRIVATE "$<$:${ARCH_FLAGS}>") diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 0ad9fe40a3e0a..b753ba767c15a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -3385,3 +3385,147 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } } + +// FIXME: this code is duplicated from ggml-aarch64.c +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) { + block_q4_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + for (int i = 0; i < QK4_0 * 2; i++) { + int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (i % blck_size_interleave); + + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; + } + + return out; 
+} + +// interleave 8 block_q4_0s in blocks of blck_size_interleave +// returns an interleaved block_q4_0x8 +// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks +// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave +static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) { + block_q4_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].d; + } + + for (int i = 0; i < QK4_0 * 4; i++) { + int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave; + int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave; + src_offset += (i % blck_size_interleave); + + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; + } + + return out; +} + +static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_0); + GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + + block_q4_0x4 * dst = (block_q4_0x4 *)t->data; + const block_q4_0 * src = (const block_q4_0 *)data; + block_q4_0 dst_tmp[4]; + int nrow = t->ne[1]; // Number of rows + int nrows_interleaved = 4; + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); + + if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + +static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_0); + GGML_ASSERT(interleave_block == 8); + + block_q4_0x8 * dst = (block_q4_0x8*)t->data; + const block_q4_0 * src = (const block_q4_0*) data; + block_q4_0 dst_tmp[8]; + int nrow = t->ne[1]; // Number of rows + int nrows_interleaved = 8; + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); + + if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++ ) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + +// Prepare for optimized kernels if applicable +void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) { + if (cur->type == repack_type) { + memcpy(cur->data, data, data_size); + return; + } + + GGML_ASSERT(cur->type == GGML_TYPE_Q4_0); + + switch (repack_type) { + case GGML_TYPE_Q4_0_8_8: + repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size); + break; + case GGML_TYPE_Q4_0_4_8: + repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size); + break; + case GGML_TYPE_Q4_0_4_4: + repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size); + break; + default: + GGML_ABORT("Unsupported type"); + } +} + +enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { + if (cur->type == GGML_TYPE_Q4_0) { + // TODO: enable for AVX2 - currently disabled due to bad gemv performance + if (/* ggml_cpu_has_avx2() 
|| */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + return GGML_TYPE_Q4_0_8_8; + } + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + return GGML_TYPE_Q4_0_4_8; + } + if (ggml_cpu_has_neon()) { + return GGML_TYPE_Q4_0_4_4; + } + } + + return cur->type; +} diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h index 203802f07320c..53b30c1dd2dfe 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h @@ -21,6 +21,9 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size); +enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4c45146a1f0f3..30b1bf895720e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7330,6 +7330,7 @@ static void ggml_compute_forward_group_norm( static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_compute_params * params, struct ggml_tensor * dst, + const enum ggml_type type, const int64_t num_rows_per_vec_dot, const int64_t ir0_start, const int64_t ir0_end, @@ -7341,8 +7342,6 @@ static void ggml_compute_forward_mul_mat_one_chunk( GGML_TENSOR_BINARY_OP_LOCALS - const enum ggml_type type = src0->type; - const bool src1_cont = ggml_is_contiguous(src1); ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; @@ -7430,7 +7429,11 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - const enum ggml_type type = src0->type; + enum ggml_type type = src0->type; + + if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { + type = (enum ggml_type)(intptr_t)src0->extra; + } enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; @@ -7469,15 +7472,15 @@ static void ggml_compute_forward_mul_mat( if (src1_cont) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(src0->type), + nb01/ggml_type_size(type), (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, - src0->type, + type, src1->type, dst->type)) goto UseGgmlGemm1; @@ -7530,15 +7533,15 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(src0->type), + nb01/ggml_type_size(type), (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), (char *)dst->data + i12*nb2 + i13*nb3, 
nb1/ggml_type_size(dst->type), ith, nth, - src0->type, + type, vec_dot_type, dst->type)) goto UseGgmlGemm2; @@ -7623,7 +7626,7 @@ UseGgmlGemm2:; const int64_t ir1_start = dr1 * ith1; const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); if (nth >= nchunk0 * nchunk1) { break; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index c7216117bc805..573b7c5b9b375 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,6 +1,7 @@ #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-cpu.h" +#include "ggml-cpu-aarch64.h" #include "ggml-impl.h" #include #include @@ -69,15 +70,84 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { } #endif +// buffer type AARCH64 + +static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT + + GGML_UNUSED(buffer); +} + +static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra; + + ggml_aarch64_repack_tensor(tensor, repack_type, data, size); + + GGML_UNUSED(buffer); +} + +static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_AARCH64"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + + if (buffer == NULL) { + return NULL; + } + + buffer->buft = buft; + buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor; + buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type_aarch64; +} + +bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) { + return buft == ggml_backend_cpu_aarch64_buffer_type(); +} + static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) { - static ggml_backend_buffer_type_t bufts[] = { + static std::vector bufts = []() { + std::vector bufts; + #ifdef GGML_USE_CPU_HBM - ggml_backend_cpu_hbm_buffer_type(), + bufts.push_back(ggml_backend_cpu_hbm_buffer_type()); +#endif + +#ifdef GGML_USE_CPU_AARCH64 + bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); #endif - NULL - }; - return bufts; + bufts.push_back(NULL); + 
+ return bufts; + }(); + + return bufts.data(); GGML_UNUSED(device); } @@ -383,6 +453,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b } static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { + if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) { + return false; + } + } + + for (int i = 1; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { + return false; + } + } + switch (op->op) { case GGML_OP_CPY: return @@ -391,13 +476,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st op->type != GGML_TYPE_IQ1_S && op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; + return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type; case GGML_OP_ROPE_BACK: return op->src[2] == NULL && (op->op_params[2] & 4) == 0; case GGML_OP_IM2COL_BACK: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32; case GGML_OP_OUT_PROD: - return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32; + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32; default: return true; } @@ -406,7 +491,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st } static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); + return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); GGML_UNUSED(dev); } @@ -566,6 +651,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { }; ggml_backend_reg_t ggml_backend_cpu_reg(void) { + // init CPU feature detection + ggml_cpu_init(); + static struct ggml_backend_reg ggml_backend_cpu_reg = { /* .iface = */ ggml_backend_cpu_reg_i, /* .context = */ NULL, diff --git a/src/llama.cpp b/src/llama.cpp index 6ec419e9b2e3d..7a9a0e3add3d6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7254,7 +7254,7 @@ static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) { auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) - ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_cpu_get_extra_bufts"); + ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts"); if (ggml_backend_dev_get_extra_bufts_fn) { ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev); while (extra_bufts && *extra_bufts) { From 5a54af4d4f588f109f31e456483fdf77096399d9 Mon Sep 17 00:00:00 2001 From: Romain Biessy Date: Fri, 15 Nov 2024 04:09:12 +0100 Subject: [PATCH 02/46] sycl: Use syclcompat::dp4a (#10267) * sycl: Use syclcompat::dp4a * Using the syclcompat version allow the compiler to optimize the operation with 
native function * Update news section * Update CI Windows oneAPI version to 2025.0 * Reword doc * Call syclcompat::dp4a inside dpct::dp4a This reverts commit 90cb61d692d61360b46954a1c7f780bd2e569b73. --- .github/workflows/build.yml | 2 +- docs/backend/SYCL.md | 2 ++ ggml/src/ggml-sycl/dpct/helper.hpp | 24 ++---------------------- ggml/src/ggml-sycl/vecdotq.hpp | 8 ++++---- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d6a7b66a511f8..c770bbd155c4c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -930,7 +930,7 @@ jobs: shell: bash env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" steps: diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index bc8c0f88647c2..38185f73897ee 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -41,6 +41,8 @@ The following release is verified with good quality: ## News +- 2024.11 + - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer. - 2024.8 - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index fe4a8f744e2e0..c2f28bb49579e 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -1830,31 +1831,10 @@ namespace dpct : id); } - template - sycl::vec extract_and_sign_or_zero_extend4(T val) - { - return sycl::vec(val) - .template as, int8_t, uint8_t>, 4>>() - .template convert(); - } - - template - using dot_product_acc_t = - std::conditional_t && std::is_unsigned_v, - uint32_t, int32_t>; - template inline auto dp4a(T1 a, T2 b, T3 c) { - dot_product_acc_t res = c; - auto va = extract_and_sign_or_zero_extend4(a); - auto vb = extract_and_sign_or_zero_extend4(b); - res += va[0] * vb[0]; - res += va[1] * vb[1]; - res += va[2] * vb[2]; - res += va[3] * vb[3]; - return res; + return syclcompat::dp4a(a, b, c); } struct sub_sat diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index d2dccade20bfd..c5942008adfbd 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -968,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, grid1[0] ^ signs[0], signs[0], std::minus<>()); const int grid_h = dpct::vectorized_binary( grid2[0] ^ signs[1], signs[1], std::minus<>()); - sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); - sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi); q8 += 8; aux32 >>= 7; } @@ -1009,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, grid1[0] ^ signs0, signs0, std::minus<>()); const int grid_h = dpct::vectorized_binary( grid2[0] ^ signs1, signs1, std::minus<>()); - sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); - sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((const int 
*)q8 + 1), sumi); q8 += 8; } const float d = From 4802ad350b8e19cbc7a77269b4494c896f6e0896 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 15 Nov 2024 08:38:43 +0200 Subject: [PATCH 03/46] scripts : fix regex in sync [no ci] --- scripts/sync-ggml-am.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 06a04745b16ab..74d6c6c8b47ce 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -144,17 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\2.cu/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\2.m/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \ From 231f9360d94446cd083b6b116f63991b1328c484 Mon Sep 17 00:00:00 2001 From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:09:35 +0800 Subject: [PATCH 04/46] cann: dockerfile and doc adjustment (#10302) Co-authored-by: noemotiovon --- .devops/llama-cli-cann.Dockerfile | 4 ++-- docs/build.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index db5ba2f25ea67..45c0585b0b3d9 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -1,6 +1,6 @@ ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8 -FROM cosdt/cann:$ASCEND_VERSION AS build +FROM ascendai/cann:$ASCEND_VERSION AS build WORKDIR /app @@ -26,7 +26,7 @@ RUN echo "Building with static libs" && \ cmake --build build --config Release --target llama-cli # TODO: use image with NNRT -FROM cosdt/cann:$ASCEND_VERSION AS runtime +FROM ascendai/cann:$ASCEND_VERSION AS runtime COPY --from=build /app/build/bin/llama-cli /llama-cli ENV LC_ALL=C.utf8 diff --git a/docs/build.md b/docs/build.md index 95512415ab0b9..52de2b4e2c224 100644 --- a/docs/build.md +++ b/docs/build.md @@ -375,7 +375,7 @@ cmake --build build --config release You can 
test with: -`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` +`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` If the following info is output on screen, you are using `llama.cpp by CANN backend`: ```bash From 9901068ac78838745e604fffb4601d315a610456 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 15 Nov 2024 05:48:49 -0400 Subject: [PATCH 05/46] server : (web UI) add copy button for code block, fix api key (#10242) * server : (web ui) add copy btn for code blocks * fix problem with api key * use settings-modal-short-input component * always show copy btn for code snippet --- examples/server/public/index.html | 62 +++++++++++++++++++++---------- examples/server/server.cpp | 42 +++++++++++++-------- 2 files changed, 68 insertions(+), 36 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 55639a9448e71..65a915d59bfd5 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -12,7 +12,7 @@ .markdown { h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; } pre { - @apply whitespace-pre-wrap my-4 rounded-lg p-2; + @apply whitespace-pre-wrap rounded-lg p-2; border: 1px solid currentColor; } /* TODO: fix markdown table */ @@ -25,8 +25,11 @@ .bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))} .bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))} .text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))} + .show-on-hover { + @apply opacity-0 group-hover:opacity-100; + } .btn-mini { - @apply cursor-pointer opacity-0 group-hover:opacity-100 hover:shadow-md; + @apply cursor-pointer hover:shadow-md; } .chat-screen { max-width: 900px; } /* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */ @@ -152,14 +155,14 @@

[The remainder of this patch is garbled: in the index.html hunks from @@ -152,14 +155,14 @@ through @@ -245,7 +249,7 @@ the HTML markup was stripped during extraction, leaving only stray text fragments from the Conversations sidebar and the Settings dialog ("Settings below are saved in browser's localStorage", "Other sampler settings", "Penalties settings"). The examples/server/server.cpp hunks listed in the diffstat above do not appear in the recovered text.]
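Since the markup for the copy button itself is lost above, the following is a minimal sketch of the behaviour the commit message describes ("add copy btn for code blocks", "always show copy btn for code snippet"), using the `.btn-mini` utility class added in the CSS hunk of this patch. It is an illustration only, assuming a plain-DOM setup; the element structure and the `attachCopyButton` helper are hypothetical, not code from this patch:

```js
// Hypothetical sketch of a per-code-block copy button; not the patch's actual code.
// Assumes rendered markdown code blocks are <pre> elements under a .markdown container.
function attachCopyButton(pre) {
  const btn = document.createElement('button');
  btn.className = 'btn-mini';   // small clickable button; class added by this patch's CSS
  btn.textContent = 'Copy';
  btn.addEventListener('click', async () => {
    // navigator.clipboard is only available in secure contexts (https or localhost)
    await navigator.clipboard.writeText(pre.textContent);
    btn.textContent = 'Copied!';
    setTimeout(() => { btn.textContent = 'Copy'; }, 1500);
  });
  // place the button just before its code block
  pre.parentElement.insertBefore(btn, pre);
}

// Wire up every rendered code block once the markdown has been inserted into the DOM.
document.querySelectorAll('.markdown pre').forEach(attachCopyButton);
```

The `.show-on-hover` class added in the same CSS hunk suggests the real UI reveals some buttons only when the surrounding `group` element is hovered, while per-snippet copy buttons stay visible, per the last commit bullet; however the patch wires it up concretely, the moving parts are a small button element and a `navigator.clipboard.writeText` call.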