From 3ad5451f3b75809e3033e4e577b9f60bcaf6676a Mon Sep 17 00:00:00 2001 From: uvos Date: Wed, 27 Nov 2024 17:10:08 +0100 Subject: [PATCH 01/19] Add some minimal optimizations for CDNA (#10498) * Add some minimal optimizations for CDNA * ggml_cuda: set launch bounds also for GCN as it helps there too --- ggml/src/ggml-cuda/common.cuh | 17 ++++++++++++++--- ggml/src/ggml-cuda/ggml-cuda.cu | 11 ++++++++++- ggml/src/ggml-cuda/mmq.cu | 2 +- ggml/src/ggml-cuda/mmq.cuh | 4 ++-- ggml/src/ggml-cuda/mmvq.cu | 2 +- ggml/src/ggml-cuda/vendors/hip.h | 8 ++++++++ 6 files changed, 36 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index b0dd16066b4ba..535118d87928e 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -47,9 +47,20 @@ #define CC_TURING 750 #define CC_AMPERE 800 #define CC_OFFSET_AMD 1000000 -#define CC_RDNA1 (CC_OFFSET_AMD + 1010) -#define CC_RDNA2 (CC_OFFSET_AMD + 1030) -#define CC_RDNA3 (CC_OFFSET_AMD + 1100) + +// GCN/CNDA, wave size is 64 +#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 +#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue +#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a +#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers +#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing +#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300 + +// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 +#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000 +#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a +#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA + #define CC_QY1 210 #define CC_QY2 220 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2a78a4393d0f7..d6e4bfdd0d437 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas( const half alpha_f16 = 1.0f; const half beta_f16 = 0.0f; + cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; + if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { + cu_compute_type = CUBLAS_COMPUTE_32F; + } + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); CUBLAS_CHECK( cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas( &alpha_f16, src0_ptr, CUDA_R_16F, ne00, src1_ptr, CUDA_R_16F, ne10, &beta_f16, dst_f16.get(), CUDA_R_16F, ldc, - CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); @@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cudaDataType_t cu_data_type = CUDA_R_16F; + if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { + cu_compute_type = CUBLAS_COMPUTE_32F; + } + // dst strides size_t nbd2 = dst->nb[2]; size_t nbd3 = dst->nb[3]; diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index ae5c68ab35129..7f7c8c90b6fe2 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } - return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 425acb20da311..8d8867121f321 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile( template #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) +#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #else #if __CUDA_ARCH__ >= CC_VOLTA __launch_bounds__(WARP_SIZE*nwarps, 1) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 735975c160dd0..02d1509836c40 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda( int64_t nwarps = 1; int64_t rows_per_cuda_block = 1; - if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 + if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA switch(ncols_y) { case 1: nwarps = 4; diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 1f3c70c2e6934..3205534d66f10 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -95,6 +95,14 @@ #define __CUDA_ARCH__ 1300 +#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) +#define GCN +#endif + +#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) +#define CDNA +#endif + #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ defined(__gfx1150__) || defined(__gfx1151__) #define RDNA3 From 9f912511bc9414fa7a3c521378b6388cd932b58d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 27 Nov 2024 22:30:52 +0100 Subject: [PATCH 02/19] common : fix duplicated file name with hf_repo and hf_file (#10550) --- common/arg.cpp | 6 +++- common/common.cpp | 54 +++++++++++++++++----------------- common/common.h | 13 ++++++-- examples/server/tests/utils.py | 1 - 4 files changed, 43 insertions(+), 31 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 272492e50df15..a6b7a1394f735 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) { } params.hf_file = params.model; } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + // this is to avoid different repo having same file name, or same file name in different subdirs + std::string filename = params.hf_repo + "_" + params.hf_file; + // to make sure we don't have any slashes in the filename + string_replace_all(filename, "/", "_"); + params.model = fs_get_cache_file(filename); } } else if (!params.model_url.empty()) { if (params.model.empty()) { diff --git a/common/common.cpp b/common/common.cpp index 09ec9f2388afb..2b2f0009897f3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) { llama_model * model = nullptr; if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); + model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams); } else if (!params.model_url.empty()) { - model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); + model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); } else { model = llama_load_model_from_file(params.model.c_str(), mparams); } @@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa } struct llama_model * common_load_model_from_url( - const char * model_url, - const char * path_model, - const char * hf_token, + const std::string & model_url, + const std::string & local_path, + const std::string & hf_token, const struct llama_model_params & params) { // Basic validation of the model_url - if (!model_url || strlen(model_url) == 0) { + if (model_url.empty()) { LOG_ERR("%s: invalid model_url\n", __func__); return NULL; } - if (!common_download_file(model_url, path_model, hf_token)) { + if (!common_download_file(model_url, local_path, hf_token)) { return NULL; } @@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url( /*.no_alloc = */ true, /*.ctx = */ NULL, }; - auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); + auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params); if (!ctx_gguf) { - LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model); + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str()); return NULL; } @@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url( // Verify the first split file format // and extract split URL and PATH prefixes { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { - LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split); return NULL; } - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { - LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split); return NULL; } } @@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url( } } - return llama_load_model_from_file(path_model, params); + return llama_load_model_from_file(local_path.c_str(), params); } struct llama_model * common_load_model_from_hf( - const char * repo, - const char * model, - const char * path_model, - const char * hf_token, + const std::string & repo, + const std::string & remote_path, + const std::string & local_path, + const std::string & hf_token, const struct llama_model_params & params) { // construct hugging face model url: // @@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf( std::string model_url = "https://huggingface.co/"; model_url += repo; model_url += "/resolve/main/"; - model_url += model; + model_url += remote_path; - return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params); + return common_load_model_from_url(model_url, local_path, hf_token, params); } #else struct llama_model * common_load_model_from_url( - const char * /*model_url*/, - const char * /*path_model*/, - const char * /*hf_token*/, + const std::string & /*model_url*/, + const std::string & /*local_path*/, + const std::string & /*hf_token*/, const struct llama_model_params & /*params*/) { LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } struct llama_model * common_load_model_from_hf( - const char * /*repo*/, - const char * /*model*/, - const char * /*path_model*/, - const char * /*hf_token*/, + const std::string & /*repo*/, + const std::string & /*remote_path*/, + const std::string & /*local_path*/, + const std::string & /*hf_token*/, const struct llama_model_params & /*params*/) { LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; diff --git a/common/common.h b/common/common.h index 286642db24158..9b1508a15fb43 100644 --- a/common/common.h +++ b/common/common.h @@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params struct llama_context_params common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct llama_model * common_load_model_from_url( + const std::string & model_url, + const std::string & local_path, + const std::string & hf_token, + const struct llama_model_params & params); +struct llama_model * common_load_model_from_hf( + const std::string & repo, + const std::string & remote_path, + const std::string & local_path, + const std::string & hf_token, + const struct llama_model_params & params); // clear LoRA adapters from context, then apply new list of adapters void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index bc590bcb31547..e31743c505d8e 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -319,7 +319,6 @@ def jina_reranker_tiny() -> ServerProcess: server.model_hf_repo = "ggml-org/models" server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf" server.model_alias = "jina-reranker" - server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf" server.n_ctx = 512 server.n_batch = 512 server.n_slots = 1 From b7420131bf8ab3e067bc660439ab1ab18be7edbd Mon Sep 17 00:00:00 2001 From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:24:46 +0800 Subject: [PATCH 03/19] CANN: ROPE operator optimization (#10540) * [cann] ROPE operator optimization Co-authored-by: noemotiovon --- ggml/src/ggml-cann/aclnn_ops.cpp | 308 +++++++++++++++++++++---------- ggml/src/ggml-cann/ggml-cann.cpp | 9 - 2 files changed, 211 insertions(+), 106 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index d7472ee3a55c7..d707efc5d1f48 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -21,22 +21,23 @@ */ #include "aclnn_ops.h" -#include "ggml-impl.h" +#include #include +#include #include #include #include #include +#include #include #include #include #include #include -#include -#include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ #include #include +#include "ggml-impl.h" #include "kernels/ascendc_kernels.h" #define GGML_COMMON_DECL_C @@ -1103,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, } /** - * @brief Creates an ACL tensor initialized with ones using a provided buffer. + * @brief Creates an ACL tensor initialized with value using a provided buffer. * - * This function initializes a tensor with ones using the specified buffer and + * This function initializes a tensor with value using the specified buffer and * tensor parameters. * * @param ctx The context for the CANN backend operations. @@ -1118,12 +1120,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, * @param type_size The size of each element in the tensor data type. * @param value The value to be used for initializing the tensor (default * is 1.0). - * @return An ACL tensor initialized with ones. + * @return An ACL tensor initialized with value. */ -static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, - size_t n_bytes, int64_t* ne, int64_t dims, - aclDataType type, size_t type_size, - float value = 1.0f) { +static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size, + float value = 1.0f) { aclTensor* acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); float alpha_host = 1.0f; @@ -1165,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - aclTensor* acl_gamma = aclnn_ones( + aclTensor* acl_gamma = aclnn_values( ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, ggml_cann_type_mapping(src->type), ggml_element_size(src)); @@ -1209,9 +1211,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); aclTensor* mask_tensor = - aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, - GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src), value); + aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, + src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), + ggml_element_size(src), value); uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1768,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Performs element-wise division of tensor1 by tensor2 , multiplies the + result by the scalar value and adds it to self . + * + * Performs element-wise division of tensor1 by tensor2, + * multiplies the result by the scalar value and adds it to self . + * The operation is defined as: + * \f[ + * \text{out}_i = \text{selft}_i + \text{value} \times + \frac{\text{tensor1}_i}{\text{tensor2}_i} + * \f] + + * @param ctx The context for the CANN backend operations. + * @param acl_self The source tensor on which the addcdiv function will be + applied. + * @param tensor1 Numerator tensor. + * @param tensor2 Denominator tensor. + * @param value The value to be used for coefficient. + */ +static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx, + aclTensor* acl_self, aclTensor* tensor1, + aclTensor* tensor2, float value) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize( + acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor, + ctx.stream())); +} + +/** + * @brief Matrix division, optionally in-place. + * + * This function division each element of the source tensor `acl_src` by the + * tensor `acl_other` and stores the result in the destination tensor `acl_dst`. + * If `inplace` is true, `acl_dst` will not be used and the operation is + * performed in-place on `acl_src`. The operation is defined as: \f[ + * \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_src Numerator tensor.. + * @param acl_other Denominator tensor. + * @param acl_dst The destination tensor where the result will be stored if + * `inplace` is false. + * @param inplace Flag indicating whether to perform the operation in-place on + * `acl_src`. + */ +static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_other, aclTensor* acl_dst, + bool inplace) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + if (inplace) { + ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } else { + ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream())); + } +} + void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const ggml_tensor* src = dst->src[0]; @@ -2311,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ctx.stream())); switch (src0->type) { - case GGML_TYPE_F32: - { + case GGML_TYPE_F32: { #ifdef ASCEND_310P - // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + // Special operation for get_row_f32 kernel of 310P: clear the + // content of dest data buffer when row is not aligned to 32 bytes if ((src0->ne[0] % 8) != 0) { - size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * + src0->ne[0] * ggml_type_size(GGML_TYPE_F32); ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); } #endif @@ -2329,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); break; } - case GGML_TYPE_F16: - { + case GGML_TYPE_F16: { #ifdef ASCEND_310P - // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + // Special operation for get_row_f16 kernel of 310P: clear the + // content of dest data buffer when row is not aligned to 32 bytes if ((src0->ne[0] % 16) != 0) { - size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16 + size_t dst_len = + src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * + ggml_type_size( + GGML_TYPE_F32); // out is also f32, even input is f16 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); } #endif @@ -2459,8 +2551,9 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, * @param acl_dst The destination tensor where the result of the matrix * multiplication will be stored. */ -static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input, - aclTensor* acl_weight, aclTensor* acl_dst) { +static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, + aclTensor* acl_input, aclTensor* acl_weight, + aclTensor* acl_dst) { int8_t cube_math_type = 2; uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -2475,8 +2568,7 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu workspaceAddr = workspace_allocator.get(); } - ACL_CHECK( - aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); + ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); } /** @@ -2496,8 +2588,9 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu * @param acl_dst The destination tensor where the result of the matrix * multiplication will be stored. */ -static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input, - aclTensor* acl_weight, aclTensor* acl_dst) { +static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, + aclTensor* acl_input, aclTensor* acl_weight, + aclTensor* acl_dst) { int8_t cube_math_type = 2; uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -2548,31 +2641,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, aclTensor* acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); - int64_t transpose_ne[] = { - bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], - bcast_weight_ne[4], bcast_weight_ne[5] - }; - size_t transpose_nb[] = { - bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], - bcast_weight_nb[4], bcast_weight_nb[5] - }; + int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5]}; + size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5]}; aclTensor* acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); aclTensor* acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); switch (n_dims) { - case 2: - aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); - break; - case 3: - aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); - break; - default: - aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); - break; + case 2: + aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + case 3: + aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + default: + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; } ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); @@ -2594,8 +2683,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, * multiplication will be stored. */ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - const enum ggml_type type) { + ggml_tensor* dst, + const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input @@ -2617,14 +2706,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, // scale stored at the end of weight. Also need transpose. size_t scale_elem_size = sizeof(uint16_t); - size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size}; + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, + scale_elem_size}; size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; char* scale_offset = (char*)src0->data + weight_size; // input size_t input_elem_size = sizeof(uint16_t); int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; - size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; + size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; ggml_cann_pool_alloc input_alloctor(ctx.pool()); void* input_buffer = src1->data; @@ -2632,7 +2722,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, // case in if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); + input_buffer = + input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); int64_t* input_cast_ne = src1->ne; size_t input_cast_nb[GGML_MAX_DIMS]; @@ -2642,9 +2733,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, } aclTensor* acl_input_tensor = ggml_cann_create_tensor( - input_buffer, - ACL_FLOAT16, - input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS); + input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, + input_cast_nb, GGML_MAX_DIMS); aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); ACL_CHECK(aclDestroyTensor(acl_input_tensor)); @@ -2655,7 +2745,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, size_t output_elem_size = sizeof(uint16_t); size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; ggml_cann_pool_alloc output_allocator(ctx.pool()); - void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size); + void* output_buffer = + output_allocator.alloc(ggml_nelements(dst) * output_elem_size); size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; // aclnn @@ -2679,7 +2770,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, // first split int64_t weight_ne_offset = 0; - int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]}; + int64_t weight_ne[2] = { + max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, + src0->ne[0]}; int64_t scale_ne_offset = 0; int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; int64_t output_ne_offset = 0; @@ -2687,24 +2780,21 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, aclTensor* acl_weight_tensor = ggml_cann_create_tensor( (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), - weight_elem_size, weight_ne, weight_nb, 2, - ACL_FORMAT_ND, weight_ne_offset); + ggml_cann_type_mapping(type), weight_elem_size, weight_ne, + weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, - ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2, - ACL_FORMAT_ND, scale_ne_offset); + scale_offset + batch0 * scale_stride, ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, + scale_ne_offset); aclTensor* acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, - ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2, - ACL_FORMAT_ND, output_ne_offset); + (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, + output_ne_offset); ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( - acl_input_tensor, acl_weight_tensor, acl_scale_tensor, - nullptr, nullptr, nullptr, nullptr, QK8_0, - acl_output_tensor, &workspaceSize, &executor)); + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, + nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, + &workspaceSize, &executor)); if (workspaceAddr == nullptr) { workspaceAddr = workspace_allocator.alloc(workspaceSize); } @@ -2717,28 +2807,29 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, // other splits for (int64_t split = 1; split < split_size; split++) { - weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1]; - weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size; + weight_ne_offset += + weight_elem_size * weight_ne[0] * weight_ne[1]; + weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] + ? src0->ne[1] - (max_elem_size * split) + : max_elem_size; scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; scale_ne[0] = weight_ne[0]; - output_ne_offset += output_elem_size * output_ne[0] * output_ne[1]; + output_ne_offset += + output_elem_size * output_ne[0] * output_ne[1]; output_ne[0] = weight_ne[0]; acl_weight_tensor = ggml_cann_create_tensor( (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), - weight_elem_size, weight_ne, weight_nb, 2, - ACL_FORMAT_ND, weight_ne_offset); + ggml_cann_type_mapping(type), weight_elem_size, weight_ne, + weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, - ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2, - ACL_FORMAT_ND, scale_ne_offset); + scale_offset + batch0 * scale_stride, ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, + scale_ne_offset); acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, - ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2, - ACL_FORMAT_ND, output_ne_offset); + (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, + output_ne_offset); ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( acl_input_tensor, acl_weight_tensor, acl_scale_tensor, @@ -2766,11 +2857,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, } aclTensor* acl_output_tensor = ggml_cann_create_tensor( - output_buffer, - ACL_FLOAT16, - output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne, + output_cast_nb, GGML_MAX_DIMS); aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); - aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, + ggml_cann_type_mapping(dst->type)); ACL_CHECK(aclDestroyTensor(acl_output_tensor)); ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); @@ -2873,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_cos_repeat_tensor, aclTensor* acl_sin_repeat_tensor, - float theta_scale, bool is_neox) { + float theta_scale, float freq_scale, + bool is_neox) { // int sin/cos cache, cache has different repeat method depond on // @param.is_neox ggml_tensor* src0 = dst->src[0]; // input ggml_tensor* src1 = dst->src[1]; // position + ggml_tensor* src2 = dst->src[2]; // freq_factors // arange, [0,1,...,ne0/2] int64_t arange_length = src0->ne[0] / 2; @@ -2907,11 +3000,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), arange_length * sizeof(float_t)); void* theta_scale_buffer = theta_scale_allocator.get(); - aclTensor* acl_theta_scale_tensor = aclnn_ones( + aclTensor* acl_theta_scale_tensor = aclnn_values( ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale); aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor); + // freq_scale + if (freq_scale != 1) { + aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); + } + + // freq_factors + if (src2) { + aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( + src2->data, ggml_cann_type_mapping(src2->type), + ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS); + aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, + nullptr, true); + } + // position GGML_ASSERT(src1->type == GGML_TYPE_I32); int64_t position_length = src1->ne[0]; @@ -2940,6 +3047,16 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor); + // // power[] * position[] * freq_scale / freq_factors[] + // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(), + // theta_length * + // sizeof(float_t)); + // aclTensor* acl_theat_final_tensor = aclnn_zero( + // ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length, + // theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t)); + // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor, + // acl_freq_factors_tensor, freq_scale); + // permute: [0,1,2,3]->[0,2,1,3] int64_t permute_ne[] = {arange_length, 1, position_length, 1}; size_t permute_nb[GGML_MAX_DIMS]; @@ -3038,8 +3155,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); - // TODO: with freq_factors - GGML_ASSERT(src2 == NULL); // TODO: attn_factor != 1 GGML_ASSERT(attn_factor == 1); // TODO: n_dims <= ne0 @@ -3047,8 +3162,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(n_dims % 2 == 0); // TODO: ext_factor != 0 GGML_ASSERT(ext_factor == 0); - // TODO: freq_scale != 1 - GGML_ASSERT(freq_scale == 1); // TODO: type == GGML_TYPE_F16 GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -3081,7 +3194,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, - theta_scale, is_neox); + theta_scale, freq_scale, is_neox); uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -3096,7 +3209,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_x = ggml_cann_create_tensor(src0); aclTensor* acl_dst = ggml_cann_create_tensor(dst); ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( - acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor)); + acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, + acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index c7a3419c796de..bcb54e44404db 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1738,13 +1738,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } case GGML_OP_ROPE: { // TODO: with ops-test v == 1 - float * freq_scale = (float*)((int32_t*)op->op_params + 6); float * ext_factor = (float*)((int32_t*)op->op_params + 7); float * attn_factor = (float*)((int32_t*)op->op_params + 8); - // TODO: with freq_factors - if (op->src[2] != NULL) { - return false; - } // TODO: n_dims <= ne0 if (op->src[0]->ne[0] != op->op_params[1]) { return false; @@ -1753,10 +1748,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, if (*ext_factor != 0) { return false; } - // TODO: freq_scale != 1 - if (*freq_scale != 1) { - return false; - } // TODO: attn_factor != 1 if (*attn_factor != 1) { return false; From 605fa66c509f9f117bd654cf0b9b3ea08bb86e80 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Thu, 28 Nov 2024 15:25:24 +0800 Subject: [PATCH 04/19] CANN: Fix SOC_TYPE compile bug (#10519) * CANN: Fix the bug build fail on Ascend310P under two cases: 1) Manual specify SOC_TYPE 2) Under some unusual compile environment * Update the cann backend News content: Support F16 and F32 data type model for Ascend 310P NPU. * fix CANN compile fail bug: the assert in ascend kernel function doesn't supportted on some CANN version --- docs/backend/CANN.md | 3 +++ ggml/src/ggml-cann/CMakeLists.txt | 7 ++++--- ggml/src/ggml-cann/kernels/CMakeLists.txt | 2 +- ggml/src/ggml-cann/kernels/dup.cpp | 1 - ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 16 ++++++++++++---- .../src/ggml-cann/kernels/quantize_f16_q8_0.cpp | 10 ++++++++++ .../src/ggml-cann/kernels/quantize_f32_q8_0.cpp | 10 ++++++++++ .../kernels/quantize_float_to_q4_0.cpp | 17 +++++++++++++++++ 8 files changed, 57 insertions(+), 9 deletions(-) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 6bdd9d2daab90..496e058074614 100644 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi ## News +- 2024.11 + - Support F16 and F32 data type model for Ascend 310P NPU. - 2024.8 - Support `Q4_0` and `Q8_0` data type for Ascend NPU. - 2024.7 @@ -43,6 +45,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi | Ascend NPU | Status | |:-----------------------------:|:-------:| | Atlas 300T A2 | Support | +| Atlas 300I Duo | Support | *Notes:* diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 901327185fb75..05cf06bfab4fc 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -22,13 +22,14 @@ if(NOT SOC_TYPE) detect_ascend_soc_type(SOC_VERSION) set(SOC_TYPE "${SOC_VERSION}") message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") -else() - string(TOLOWER ${SOC_TYPE} SOC_VERSION) endif() -# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. +string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower + +# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P. string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") +string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION) if (CANN_INSTALL_DIR) # Only Support Linux. diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index 6a4e17cce54c9..d687220c3c57e 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC ${SRC_FILES} ) -message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") +message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.") ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp index 99f03e05883aa..c7ba38d10a0b2 100644 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -20,7 +20,6 @@ class DupByRows { // Input has four dims. int64_t op_block_num = GetBlockNum(); int64_t op_block_idx = GetBlockIdx(); - assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM); // param num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp index 377211096e1f5..4fbe722086cf0 100644 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -2,6 +2,15 @@ // optimize me. Use template to avoid copy code. using namespace AscendC; +#ifdef ASCEND_310P // 310P not support 4bit get row + extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support 4bit get row.\n"); + } +#else #define BUFFER_NUM 2 @@ -110,12 +119,9 @@ class GET_ROW_Q4_0 { LocalTensor output_local = output_queue.AllocTensor(); // TODO: cast more data to speed up. -#ifdef ASCEND_310P - // TODO: 310P support quantification -#else Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); -#endif + // Only mul need compile by group. half scale = scale_gm.GetValue(scale_offset); @@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( indices_nb_ub, output_ne_ub, output_nb_ub); op.calculate(); } + +#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp index 8423b3f02a8f8..504b43afaa1f4 100644 --- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp @@ -1,6 +1,14 @@ #include "kernel_operator.h" using namespace AscendC; +#ifdef ASCEND_310P + extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f16->8bit quantization.\n"); + } +#else #define BUFFER_NUM 2 #define QK8_0 32 @@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); op.calculate(); } + +#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp index b7c575093e9c1..05b0bc1df59af 100644 --- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp @@ -1,6 +1,14 @@ #include "kernel_operator.h" using namespace AscendC; +#ifdef ASCEND_310P // 310P not support f32->8bit quantization + extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f32->8bit quantization.\n"); + } +#else #define BUFFER_NUM 2 #define QK8_0 32 @@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); op.calculate(); } + +#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp index 9c8c86b66ad66..1188937b74461 100644 --- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp @@ -1,6 +1,21 @@ #include "kernel_operator.h" using namespace AscendC; +#ifdef ASCEND_310P // 310P not support float->4bit quantization + extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f32->4bit quantization.\n"); + } + + extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f16->4bit quantization.\n"); + } +#else #define BUFFER_NUM 2 #define Group_Size 32 @@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); op.calculate(); } + +#endif // #ifdef ASCEND_310P From c6bc73951ed52466392b1abda98c28ecbe522c7f Mon Sep 17 00:00:00 2001 From: Ruixin Huang <18860020911@163.com> Date: Thu, 28 Nov 2024 15:27:11 +0800 Subject: [PATCH 05/19] CANN: Update cann.md to display correctly in CLion (#10538) --- docs/backend/CANN.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 496e058074614..23f10175a6b2d 100644 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -42,6 +42,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi ### Ascend NPU **Verified devices** + | Ascend NPU | Status | |:-----------------------------:|:-------:| | Atlas 300T A2 | Support | From 2025fa67e94358deda4740a74fe9803916cb2f60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20L=C3=B3pez?= Date: Thu, 28 Nov 2024 12:51:38 +0100 Subject: [PATCH 06/19] kompute : improve backend to pass test_backend_ops (#10542) * kompute: op_unary: reject unsupported parameters Signed-off-by: Sergio Lopez * kompute: softmax: implement ALiBi support Signed-off-by: Sergio Lopez * kompute: rope: implement neox and phi3 support Signed-off-by: Sergio Lopez * kompute: op_mul_mat_q4_k permutted support Signed-off-by: Sergio Lopez * kompute: op_mul_mat_[q4_0|q4_1|q8_0] permutted support Signed-off-by: Sergio Lopez * kompute: op_mul_mat_f16 permutted support Signed-off-by: Sergio Lopez * kompute: op_mul_mat_q6_k permutted support Signed-off-by: Sergio Lopez --------- Signed-off-by: Sergio Lopez --- ggml/src/ggml-kompute/CMakeLists.txt | 12 +- ggml/src/ggml-kompute/ggml-kompute.cpp | 176 ++++++++++++------ .../ggml-kompute/kompute-shaders/common.comp | 1 + .../kompute-shaders/op_mul_mat_f16.comp | 6 +- .../kompute-shaders/op_mul_mat_q4_k.comp | 19 +- .../kompute-shaders/op_mul_mat_q6_k.comp | 24 ++- .../kompute-shaders/op_mul_mv_q_n.comp | 14 +- .../kompute-shaders/op_mul_mv_q_n_pre.comp | 8 +- .../kompute-shaders/op_rope_f16.comp | 73 -------- .../kompute-shaders/op_rope_f32.comp | 73 -------- .../kompute-shaders/op_rope_neox_f16.comp | 52 ++++++ .../kompute-shaders/op_rope_neox_f32.comp | 52 ++++++ .../kompute-shaders/op_rope_norm_f16.comp | 52 ++++++ .../kompute-shaders/op_rope_norm_f32.comp | 52 ++++++ .../kompute-shaders/op_softmax.comp | 20 +- .../kompute-shaders/rope_common.comp | 2 + 16 files changed, 403 insertions(+), 233 deletions(-) delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp create mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt index dc623926c7685..c9109d5e8ee19 100644 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ b/ggml/src/ggml-kompute/CMakeLists.txt @@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") kompute-shaders/op_getrows_q4_0.comp kompute-shaders/op_getrows_q4_1.comp kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_f16.comp - kompute-shaders/op_rope_f32.comp + kompute-shaders/op_rope_norm_f16.comp + kompute-shaders/op_rope_norm_f32.comp + kompute-shaders/op_rope_neox_f16.comp + kompute-shaders/op_rope_neox_f32.comp kompute-shaders/op_cpy_f16_f16.comp kompute-shaders/op_cpy_f16_f32.comp kompute-shaders/op_cpy_f32_f16.comp @@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") shaderop_getrows_q4_0.h shaderop_getrows_q4_1.h shaderop_getrows_q6_k.h - shaderop_rope_f16.h - shaderop_rope_f32.h + shaderop_rope_norm_f16.h + shaderop_rope_norm_f32.h + shaderop_rope_neox_f16.h + shaderop_rope_neox_f32.h shaderop_cpy_f16_f16.h shaderop_cpy_f16_f32.h shaderop_cpy_f32_f16.h diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp index 24566404ded0f..28ceecfc40d66 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -28,8 +28,10 @@ #include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_1.h" #include "shaderop_getrows_q6_k.h" -#include "shaderop_rope_f16.h" -#include "shaderop_rope_f32.h" +#include "shaderop_rope_norm_f16.h" +#include "shaderop_rope_norm_f32.h" +#include "shaderop_rope_neox_f16.h" +#include "shaderop_rope_neox_f32.h" #include "shaderop_cpy_f16_f16.h" #include "shaderop_cpy_f16_f32.h" #include "shaderop_cpy_f32_f16.h" @@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t std::vector descriptorPoolSizes = { vk::DescriptorPoolSize( vk::DescriptorType::eStorageBuffer, - 3 * size // Descriptor count is number of possible tensors to pass into an algorithm + 4 * size // Descriptor count is number of possible tensors to pass into an algorithm ) }; @@ -788,7 +790,8 @@ static void ggml_vk_soft_max( const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03, - float scale + float scale, float max_bias, float m0, float m1, + uint32_t n_head_log2 ) { const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv, kp::shader_data::op_softmax_comp_spv_len); @@ -796,12 +799,14 @@ static void ggml_vk_soft_max( struct PushConstants { uint32_t inAOff, inBOff, outOff; int32_t ne00, ne01, ne02; - float scale; + float scale, max_bias, m0, m1; + uint32_t n_head_log2; int32_t mask; } pushConsts { safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), ne00, ne01, ne02, - scale, + scale, max_bias, m0, m1, + n_head_log2, bool(inB) }; @@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16( const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb00, uint32_t nb01, uint32_t nb02, + uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - uint32_t nb10, uint32_t nb11, uint32_t nb12, + uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13, int32_t ne0, int32_t ne1, uint32_t r2, uint32_t r3 ) { @@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16( struct PushConstants { uint32_t inAOff, inBOff, outOff; int32_t ne00, ne01, ne02; - uint32_t nb00, nb01, nb02; + uint32_t nb00, nb01, nb02, nb03; int32_t ne10, ne11, ne12; - uint32_t nb10, nb11, nb12; + uint32_t nb10, nb11, nb12, nb13; int32_t ne0, ne1; uint32_t r2, r3; } pushConsts { safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), ne00, ne01, ne02, - nb00, nb01, nb02, + nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, + nb10, nb11, nb12, nb13, ne0, ne1, r2, r3 }; @@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl( int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0, int32_t ne1, + uint32_t nb01, uint32_t nb02, uint32_t nb03, + uint32_t nb11, uint32_t nb12, uint32_t nb13, uint32_t r2, uint32_t r3 ) { struct PushConstants { @@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl( int32_t ne00, ne01, ne02; int32_t ne10, ne12; int32_t ne0, ne1; + uint32_t nb01, nb02, nb03; + uint32_t nb11, nb12, nb13; uint32_t r2, r3; } pushConsts { safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), ne00, ne01, ne02, ne10, ne12, ne0, ne1, + nb01, nb02, nb03, + nb11, nb12, nb13, r2, r3 }; auto name = std::string(__func__) + "_" + suffix; std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(name)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8; s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(name); @@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k( const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10, - int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0, - int32_t ne1, int32_t r2, int32_t r3 + int32_t ne00, int32_t ne01, int32_t ne02, + int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, + int32_t ne0, int32_t ne1, + uint32_t nb01, uint32_t nb02, uint32_t nb03, + uint32_t nb11, uint32_t nb12, uint32_t nb13, + uint32_t r2, uint32_t r3 ) { const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv, kp::shader_data::op_mul_mat_q4_k_comp_spv_len); struct PushConstants { uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3; + int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; + uint32_t nb01, nb02, nb03, nb11, nb12, nb13; + uint32_t r2, r3; } pushConsts { - 0, 0, 0, - ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3 + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne10, ne0, ne1, ne01, ne02, ne12, + nb01, nb02, nb03, nb11, nb12, nb13, + r2, r3 }; std::shared_ptr s_algo = nullptr; @@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k( const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, - int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02 + int32_t ne00, int32_t ne01, int32_t ne02, + int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, + int32_t ne0, int32_t ne1, + uint32_t nb01, uint32_t nb02, uint32_t nb03, + uint32_t nb11, uint32_t nb12, uint32_t nb13, + uint32_t r2, uint32_t r3 ) { const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, kp::shader_data::op_mul_mat_q6_k_comp_spv_len); struct PushConstants { uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, gqa; + int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; + uint32_t nb01, nb02, nb03, nb11, nb12, nb13; + uint32_t r2, r3; } pushConsts { inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, ne1, ne01, ne12/ne02 + ne00, ne10, ne0, ne1, ne01, ne02, ne12, + nb01, nb02, nb03, nb11, nb12, nb13, + r2, r3 }; std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + const uint32_t local_x = 2; + const uint32_t local_y = ggml_vk_current_device().subgroupSize; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}); + s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } @@ -1217,10 +1244,11 @@ static void ggml_vk_rope( kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, + const std::shared_ptr& inC, const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff, ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig, - float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow, + float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow, int32_t ne01, int32_t ne02, int32_t ne03, uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, int32_t ne0, @@ -1228,11 +1256,17 @@ static void ggml_vk_rope( ) { GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32); - static const auto spirv_f16 = getSpirvShader( - kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len + static const auto spirv_norm_f16 = getSpirvShader( + kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len + ); + static const auto spirv_norm_f32 = getSpirvShader( + kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len + ); + static const auto spirv_neox_f16 = getSpirvShader( + kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len ); - static const auto spirv_f32 = getSpirvShader( - kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len + static const auto spirv_neox_f32 = getSpirvShader( + kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len ); int type_size = src0t == GGML_TYPE_F16 ? 2 : 4; @@ -1247,32 +1281,40 @@ static void ggml_vk_rope( GGML_ASSERT(nb0 % type_size == 0); struct PushConstants { - uint32_t inAOff, inBOff, outOff; + uint32_t inAOff, inBOff, inCOff, outOff; int32_t n_dims, mode, n_ctx_orig; - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + float freq_base, freq_scale; + bool has_freq_factors; + float ext_factor, attn_factor, beta_fast, beta_slow; uint32_t nb00, nb01, nb02, nb03; int32_t ne0; uint32_t nb0, nb1, nb2, nb3; } pushConsts { - safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size), + safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size), n_dims, mode, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, + freq_base, freq_scale, + has_freq_factors, + ext_factor, attn_factor, beta_fast, beta_slow, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 }; - auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32"); + auto & inC_ = inC ? inC : inA; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_f16 = src0t == GGML_TYPE_F16; + + auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32"); std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(name)) { + auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32; s_algo = komputeManager()->algorithm( - name, s_kompute_context->pool.get(), {inA, inB, out}, - src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32, + name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts} ); } else { s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, out}); + s_algo->setTensors({inA, inB, inC_, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); @@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) { } static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + int64_t n = ggml_nelements(op); switch (op->op) { case GGML_OP_UNARY: + if (n % 4 != 0) return false; switch (ggml_get_unary_op(op)) { - case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_GELU: + if (n % 8 != 0) return false; + // fall through + case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_SILU: return ggml_is_contiguous(op->src[0]); default: @@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons switch (op->src[0]->type) { case GGML_TYPE_F32: - case GGML_TYPE_Q6_K: return op->ne[3] == 1; + case GGML_TYPE_Q6_K: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: @@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml const static std::shared_ptr nullTensor = nullptr; uint32_t off_src0 = 0; uint32_t off_src1 = 0; + uint32_t off_src2 = 0; uint32_t off_dst = 0; const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor; const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor; + const std::shared_ptr& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor; const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor; switch (dst->op) { @@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); -#pragma message("TODO: add ALiBi support") -#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192") - GGML_ASSERT(max_bias == 0.0f); + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + + const uint32_t n_head = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale); + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2); } break; case GGML_OP_DIAG_MASK_INF: { @@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml case GGML_TYPE_F16: ggml_vk_mul_mat_f16( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12, + ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13, ne0, ne1, r2, r3 ); break; case GGML_TYPE_Q8_0: ggml_vk_mul_mat_q8_0( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, + nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 ); break; case GGML_TYPE_Q4_0: ggml_vk_mul_mat_q4_0( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, + nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 ); break; case GGML_TYPE_Q4_1: ggml_vk_mul_mat_q4_1( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, + nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 ); break; case GGML_TYPE_Q4_K: ggml_vk_mul_mat_q4_k( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03 + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, + nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 ); break; case GGML_TYPE_Q6_K: ggml_vk_mul_mat_q6_k( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02 + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, + nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 ); break; default: { @@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml } break; case GGML_OP_ROPE: { -#pragma message("TODO: implement phi3 frequency factors support") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") - GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); - -#pragma message("TODO: update rope NORM mode to match NEOX mode") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") - GGML_ASSERT(ne10 == ne02); GGML_ASSERT(src0t == dstt); // const int n_past = ((int32_t *) dst->op_params)[0]; @@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + const bool has_freq_factors = dst->src[2] != nullptr; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); @@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); ggml_vk_rope( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, + seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig, + freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 ); } break; diff --git a/ggml/src/ggml-kompute/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp index 2aaddf704a758..dbe4cf804e6c0 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/common.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/common.comp @@ -3,6 +3,7 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #extension GL_EXT_shader_explicit_arithmetic_types_int8: require #extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int64: require #extension GL_EXT_control_flow_attributes: enable #extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_EXT_debug_printf : enable diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp index 8f0a9031f7a37..0ab1b2fc20eeb 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp @@ -20,12 +20,14 @@ layout (push_constant) uniform parameter { uint nb00; uint nb01; uint nb02; + uint nb03; int ne10; int ne11; int ne12; uint nb10; uint nb11; uint nb12; + uint nb13; int ne0; int ne1; uint r2; @@ -42,7 +44,7 @@ void main() { const uint i12 = im%pcs.ne12; const uint i13 = im/pcs.ne12; - const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02; + const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03; const uint x = offset0 / 2 + pcs.inAOff; // Based from inA @@ -52,7 +54,7 @@ void main() { break; } - const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB + const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; float sumf = 0; for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp index fc8e45aa97776..a5752a3a0065f 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp @@ -24,8 +24,14 @@ layout (push_constant) uniform parameter { int ne01; int ne02; int ne12; - int r2; - int r3; + uint nb01; + uint nb02; + uint nb03; + uint nb11; + uint nb12; + uint nb13; + uint r2; + uint r3; } pcs; void main() { @@ -50,10 +56,11 @@ void main() { const uint i12 = im%pcs.ne12; const uint i13 = im/pcs.ne12; - const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); + const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); + const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13; - const uint xblk = ib_row + offset0 + pcs.inAOff; - const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; + const uint xblk = offset0 + pcs.inAOff; + const uint y = (offset1 / 4) + pcs.inBOff; float yl[16]; float yh[16]; @@ -74,7 +81,7 @@ void main() { } for (int row = 0; row < N_DST; row++) { - uint row_idx = row * nb; + uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK); uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0); uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp index c9baebdf4baac..d331d1a70572e 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp @@ -21,7 +21,16 @@ layout (push_constant) uniform parameter { int ne0; int ne1; int ne01; - int gqa; + int ne02; + int ne12; + uint nb01; + uint nb02; + uint nb03; + uint nb11; + uint nb12; + uint nb13; + uint r2; + uint r3; } pcs; void main() { @@ -34,12 +43,15 @@ void main() { const uint r0 = gl_WorkGroupID.x; const uint r1 = gl_WorkGroupID.y; - const uint r2 = gl_WorkGroupID.z; + const uint im = gl_WorkGroupID.z; const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); - const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0); - const uint x = row * nb + offset0; // Based from inA without base offset - const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + + const uint i12 = im%pcs.ne12; + const uint i13 = im/pcs.ne12; + + const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); + const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; float sumf = 0; @@ -89,6 +101,6 @@ void main() { const float tot = subgroupAdd(sumf); if (subgroupElect()) { - out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; + out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; } } diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp index 440b5ab2c81f8..a6517cc1f1993 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp @@ -14,10 +14,15 @@ void main() { const uint i12 = im%pcs.ne12; const uint i13 = im/pcs.ne12; - const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); + // pointers to src0 rows + uint ax[N_ROWS]; + for (int row = 0; row < N_ROWS; ++row) { + const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); + + ax[row] = offset0 + pcs.inAOff; + } - const uint x = offset0; // Based from inA without base offset - const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; @@ -32,8 +37,7 @@ void main() { for (uint ib = ix; ib < nb; ib += 16) { for (int row = 0; row < N_ROWS; row++) { - const uint block_index = x + ib + row * nb; - sumf[row] += block_q_n_dot_y(block_index, yb, il); + sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il); } yb += BLOCKS_IN_QUANT * 16; diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp index 7912b09ac69c4..a9a2f22180ffd 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp @@ -1,5 +1,5 @@ layout(local_size_x_id = 0) in; -layout(local_size_y = 1) in; +layout(local_size_y = 8) in; layout(local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; @@ -17,6 +17,12 @@ layout (push_constant) uniform parameter { int ne12; int ne0; int ne1; + uint nb01; + uint nb02; + uint nb03; + uint nb11; + uint nb12; + uint nb13; uint r2; uint r3; } pcs; diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp deleted file mode 100644 index 0ecfb2eab527c..0000000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f16.comp +++ /dev/null @@ -1,73 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - const int p = inB[pcs.inBOff + i2]; - - float theta = float(p); - - if (!is_neox) { - for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { - float cos_theta, sin_theta; - rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - theta *= theta_scale; - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - const float x0 = float(inA[src]); - const float x1 = float(inA[src+1]); - - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta); - } - } else { - const float inv_ndims = -1.f/pcs.n_dims; - for (uint ic = 0; ic < pcs.n_dims; ic += 2) { - const uint cur_rot = ic; - - float cos_theta, sin_theta; - rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - theta *= theta_scale; - - const uint i0 = ic/2; - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - const float x0 = float(inA[src]); - const float x1 = float(inA[src+pcs.n_dims/2]); - - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); - } - - for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { - const uint i0 = ic; - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - out_[dst_data + 0] = inA[src + 0]; - out_[dst_data + 1] = inA[src + 1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp deleted file mode 100644 index cec0fd9a5d10c..0000000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_f32.comp +++ /dev/null @@ -1,73 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - const int p = inB[pcs.inBOff + i2]; - - float theta = float(p); - - if (!is_neox) { - for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { - float cos_theta, sin_theta; - rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - theta *= theta_scale; - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - const float x0 = inA[src]; - const float x1 = inA[src+1]; - - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+1] = x0*sin_theta + x1*cos_theta; - } - } else { - const float inv_ndims = -1.f/pcs.n_dims; - for (uint ic = 0; ic < pcs.n_dims; ic += 2) { - const uint cur_rot = ic; - - float cos_theta, sin_theta; - rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - theta *= theta_scale; - - const uint i0 = ic/2; - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - const float x0 = inA[src]; - const float x1 = inA[src+pcs.n_dims/2]; - - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; - } - - for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { - const uint i0 = ic; - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - out_[dst_data + 0] = inA[src + 0]; - out_[dst_data + 1] = inA[src + 1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp new file mode 100644 index 0000000000000..63659cbfe5524 --- /dev/null +++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "rope_common.comp" + +layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; +layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; }; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + float theta_base = float(inB[pcs.inBOff + i2]); + float inv_ndims = -1.f/pcs.n_dims; + + float cos_theta; + float sin_theta; + + for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { + if (i0 < pcs.n_dims) { + uint ic = i0/2; + + float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); + + const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; + + rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + const float x0 = float(inA[src]); + const float x1 = float(inA[src+pcs.n_dims/2]); + + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); + } else { + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + out_[dst_data] = inA[src]; + out_[dst_data+1] = inA[src+1]; + } + } +} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp new file mode 100644 index 0000000000000..4df56204d7233 --- /dev/null +++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "rope_common.comp" + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; +layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; }; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + float theta_base = float(inB[pcs.inBOff + i2]); + float inv_ndims = -1.f/pcs.n_dims; + + float cos_theta; + float sin_theta; + + for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { + if (i0 < pcs.n_dims) { + uint ic = i0/2; + + float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); + + const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; + + rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + const float x0 = inA[src]; + const float x1 = inA[src+pcs.n_dims/2]; + + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + out_[dst_data] = inA[src]; + out_[dst_data+1] = inA[src+1]; + } + } +} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp new file mode 100644 index 0000000000000..a3c0eda8bd399 --- /dev/null +++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "rope_common.comp" + +layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; +layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; }; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + float theta_base = float(inB[pcs.inBOff + i2]); + float inv_ndims = -1.f/pcs.n_dims; + + float cos_theta; + float sin_theta; + + for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { + if (i0 < pcs.n_dims) { + uint ic = i0/2; + + float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); + + const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; + + rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + const float x0 = float(inA[src]); + const float x1 = float(inA[src+1]); + + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta); + } else { + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + out_[dst_data] = inA[src]; + out_[dst_data+1] = inA[src+1]; + } + } +} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp new file mode 100644 index 0000000000000..b7963ae725390 --- /dev/null +++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "rope_common.comp" + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; +layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; }; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + float theta_base = float(inB[pcs.inBOff + i2]); + float inv_ndims = -1.f/pcs.n_dims; + + float cos_theta; + float sin_theta; + + for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { + if (i0 < pcs.n_dims) { + uint ic = i0/2; + + float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); + + const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; + + rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + const float x0 = inA[src]; + const float x1 = inA[src+1]; + + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+1] = x0*sin_theta + x1*cos_theta; + } else { + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + out_[dst_data] = inA[src]; + out_[dst_data+1] = inA[src+1]; + } + } +} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp index 7bc9176cabaae..4165295bf4b3c 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp @@ -18,6 +18,10 @@ layout(push_constant) uniform PushConstants { int ne01; int ne02; float scale; + float max_bias; + float m0; + float m1; + uint n_head_log2; int mask; } pcs; @@ -34,17 +38,29 @@ void main() { const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB const uint pdst = extra_off + pcs.outOff; // Based from out_ + float slope = 1.0f; + + // ALiBi + if (pcs.max_bias > 0.0f) { + int64_t h = i02; + + float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1; + int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1; + + slope = pow(base, float(exp)); + } + // parallel max float localMax = uintBitsToFloat(0xFF800000); for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f)); + localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f)); } float max_ = subgroupMax(localMax); // parallel sum float localSum = 0.0f; for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_); + const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_); localSum += exp_psrc0; out_[pdst + i00] = exp_psrc0; } diff --git a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp index df4702896d46f..0fca640dcc232 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp @@ -8,12 +8,14 @@ layout(local_size_x = 1) in; layout (push_constant) uniform parameter { uint inAOff; uint inBOff; + uint inCOff; uint outOff; int n_dims; int mode; int n_ctx_orig; float freq_base; float freq_scale; + bool has_freq_factors; float ext_factor; float attn_factor; float beta_fast; From c202cef1686182a78f8f4e253ab8d0c0ffe2fcc8 Mon Sep 17 00:00:00 2001 From: Shupei Fan Date: Thu, 28 Nov 2024 20:52:03 +0800 Subject: [PATCH 07/19] ggml-cpu: support IQ4_NL_4_4 by runtime repack (#10541) * ggml-cpu: support IQ4_NL_4_4 by runtime repack * ggml-cpu: add __ARM_FEATURE_DOTPROD guard --- ggml/include/ggml-cpu.h | 1 + ggml/include/ggml.h | 3 + ggml/src/ggml-common.h | 6 + ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 321 +++++++++++++++++++++++++-- ggml/src/ggml-cpu/ggml-cpu-aarch64.h | 2 + ggml/src/ggml-cpu/ggml-cpu.c | 27 ++- ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +- ggml/src/ggml.c | 9 + 8 files changed, 352 insertions(+), 19 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index a5358d047a08e..e14ea9ea5301f 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -91,6 +91,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_neon (void); GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); + GGML_BACKEND_API int ggml_cpu_has_dotprod (void); GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); GGML_BACKEND_API int ggml_cpu_has_sve (void); GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9843b09fbe83e..65cb92c444bb7 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -389,6 +389,9 @@ extern "C" { GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, + GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_8 = 37, + // GGML_TYPE_IQ4_NL_8_8 = 38, GGML_TYPE_COUNT, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 050161393456e..27253a6c2b3ca 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -418,6 +418,12 @@ typedef struct { } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +typedef struct { + ggml_half d[4]; // deltas for 4 iq4_nl blocks + uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks +} block_iq4_nlx4; +static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); + #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 96a16dfba1f65..ced3788790671 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -187,6 +187,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) } #endif +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); @@ -528,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -996,6 +998,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } +void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float * res_ptr = s; + + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + float32x4_t sumf = vdupq_n_f32(0); + for (int l = 0; l < nb; l++) { + uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0); + uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16); + uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32); + uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48); + + int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4); + int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F); + int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4); + int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F); + int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4); + int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F); + int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4); + int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F); + + int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0); + int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16); + + int32x4_t sumi = vdupq_n_s32(0); + sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0); + sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0); + sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1); + sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1); + sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2); + sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2); + sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3); + sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3); + + float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d)); + float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); + float32x4_t d = a_d * b_d; + + sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi)); + } + + vst1q_f32(res_ptr + x * 4, sumf); + } + return; + } +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) + { + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -1017,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -3386,6 +3484,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } +void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + float32x4_t sumf[4]; + for (int m = 0; m < 4; m++) { + sumf[m] = vdupq_n_f32(0); + } + + for (int l = 0; l < nb; l++) { + float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d)); + float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); + + int32x4_t sumi_0 = vdupq_n_s32(0); + int32x4_t sumi_1 = vdupq_n_s32(0); + int32x4_t sumi_2 = vdupq_n_s32(0); + int32x4_t sumi_3 = vdupq_n_s32(0); + + for (int k = 0; k < 4; k++) { + int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0); + int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64); + + uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k); + int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4); + int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF); + + sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0); + sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1); + sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2); + sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3); + sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0); + sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1); + sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2); + sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3); + } + + sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0)); + sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1)); + sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2)); + sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3)); + } + + for (int m = 0; m < 4; m++) { + vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]); + } + } + } + return; + } +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + // FIXME: this code is duplicated from ggml-aarch64.c static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { block_q4_0x4 out; @@ -3518,6 +3727,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, GGML_UNUSED(data_size); } +static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) { + block_iq4_nlx4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_NL * 2 / blck_size_interleave; + + if (blck_size_interleave == 8) { + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + // Using memcpy to avoid unaligned memory accesses + memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); + } + } else if (blck_size_interleave == 4) { + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t)); + } + } else { + GGML_ASSERT(false); + } + + return out; +} + +static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); + GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + + block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; + const block_iq4_nl * src = (const block_iq4_nl *)data; + block_iq4_nl dst_tmp[4]; + int nrow = t->ne[1]; // Number of rows + int nrows_interleaved = 4; + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); + + if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + // Prepare for optimized kernels if applicable void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) { if (cur->type == repack_type) { @@ -3525,20 +3798,30 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_ return; } - GGML_ASSERT(cur->type == GGML_TYPE_Q4_0); - - switch (repack_type) { - case GGML_TYPE_Q4_0_8_8: - repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size); - break; - case GGML_TYPE_Q4_0_4_8: - repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size); - break; - case GGML_TYPE_Q4_0_4_4: - repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size); - break; - default: - GGML_ABORT("Unsupported type"); + if (cur->type == GGML_TYPE_Q4_0) { + switch (repack_type) { + case GGML_TYPE_Q4_0_8_8: + repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size); + break; + case GGML_TYPE_Q4_0_4_8: + repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size); + break; + case GGML_TYPE_Q4_0_4_4: + repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size); + break; + default: + GGML_ABORT("Unsupported type"); + } + } else if (cur->type == GGML_TYPE_IQ4_NL) { + switch (repack_type) { + case GGML_TYPE_IQ4_NL_4_4: + repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size); + break; + default: + GGML_ABORT("Unsupported type"); + } + } else { + GGML_ABORT("Unsupported type"); } } @@ -3551,9 +3834,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { return GGML_TYPE_Q4_0_4_8; } - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { return GGML_TYPE_Q4_0_4_4; } + } else if (cur->type == GGML_TYPE_IQ4_NL) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + return GGML_TYPE_IQ4_NL_4_4; + } } return cur->type; diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h index 53b30c1dd2dfe..3d9db6a19eb87 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h @@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); // GEMM void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size); enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c6ede19d9d1c0..fea867440424e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { int has_neon; + int has_dotprod; int has_i8mm; int has_sve; int sve_cnt; -} ggml_arm_arch_features = {-1, -1, -1, 0}; +} ggml_arm_arch_features = {-1, -1, -1, -1, 0}; #endif @@ -446,6 +447,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ4_NL_4_4] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .ncols = 4, + .gemv = ggml_gemv_iq4_nl_4x4_q8_0, + .gemm = ggml_gemm_iq4_nl_4x4_q8_0, + }, }; const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { @@ -2439,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) { uint32_t hwcap2 = getauxval(AT_HWCAP2); ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); + ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP); ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); @@ -2453,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) { } ggml_arm_arch_features.has_neon = oldp; + if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_arm_arch_features.has_dotprod = oldp; + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { oldp = 0; } @@ -9133,6 +9149,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: + case GGML_TYPE_IQ4_NL_4_4: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -13880,6 +13897,14 @@ int ggml_cpu_has_neon(void) { #endif } +int ggml_cpu_has_dotprod(void) { +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD) + return ggml_arm_arch_features.has_dotprod; +#else + return 0; +#endif +} + int ggml_cpu_has_sve(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.has_sve; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index febed433ada2b..44d99089a490c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -457,7 +457,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st const struct ggml_tensor * src1 = op->src[1]; if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { - if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) { + if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { return false; } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1a2318cb188c4..1a9a7efaf7f39 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -831,6 +831,15 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_tq2_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, }, + [GGML_TYPE_IQ4_NL_4_4] = { + .type_name = "iq4_nl_4x4", + .blck_size = QK4_NL, + .blck_size_interleave = 4, + .type_size = sizeof(block_iq4_nl), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { From eea986f215e1dc490654d012ccf2ab62fe8f606d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Nov 2024 14:56:23 +0200 Subject: [PATCH 08/19] cmake : fix ARM feature detection (#10543) ggml-ci --- ggml/src/ggml-cpu/CMakeLists.txt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index ddc05ecef6f5c..4dbc1f75b647f 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -82,17 +82,23 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD) add_compile_definitions(__ARM_FEATURE_DOTPROD) + + message(STATUS "ARM feature DOTPROD enabled") endif () - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8) add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") endif () check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + + message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") endif () set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) @@ -113,17 +119,23 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR if (GGML_COMPILER_SUPPORT_DOTPROD) set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") add_compile_definitions(__ARM_FEATURE_DOTPROD) + + message(STATUS "ARM feature DOTPROD enabled") endif () set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8) set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") endif () + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") From 76b27d29c22af03172cf211a8a31025c7c828a57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Nov 2024 14:56:37 +0200 Subject: [PATCH 09/19] ggml : fix row condition for i8mm kernels (#10561) ggml-ci --- ggml/src/ggml-cpu/ggml-cpu-quants.c | 6 ++++-- ggml/src/ggml-cpu/ggml-cpu.c | 17 +++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index f0e276b698795..11e8df253d5ca 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -1813,11 +1813,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), l1, r1)), l2, r2)), l3, r3))), scale); } - float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s, vget_low_f32 (sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2)); + return; } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index fea867440424e..1c88e5d81ab6c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7576,14 +7576,6 @@ UseGgmlGemm2:; // This is the size of the rest of the dimensions of the result const int64_t nr1 = ne1 * ne2 * ne3; - // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int64_t num_rows_per_vec_dot = vec_dot_num_rows; - // TODO: currently the mmla kernels support only even numbered rows/cols. - // this check can be removed once they are extended to support odd numbered rows/cols too - if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { - num_rows_per_vec_dot = 1; - } - // Now select a reasonable chunk size. int chunk_size = 16; @@ -7646,6 +7638,15 @@ UseGgmlGemm2:; const int64_t ir1_start = dr1 * ith1; const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t num_rows_per_vec_dot = vec_dot_num_rows; + + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); if (nth >= nchunk0 * nchunk1) { From e90688edd004fdb7063f463bd18408ba9ae008dd Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 28 Nov 2024 15:58:54 +0100 Subject: [PATCH 10/19] ci : fix tag name in cuda and hip releases (#10566) --- .github/workflows/build.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e02b5c6200a82..48953dafa0744 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -904,6 +904,8 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Install Cuda Toolkit 11.7 if: ${{ matrix.cuda == '11.7' }} @@ -1139,6 +1141,8 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Install id: depends From 7281cf13addfae9b64bb2be87e3b5b1914505d63 Mon Sep 17 00:00:00 2001 From: Random Fly Date: Thu, 28 Nov 2024 23:03:11 +0800 Subject: [PATCH 11/19] docs: fix outdated usage of llama-simple (#10565) --- docs/android.md | 4 ++-- examples/simple/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/android.md b/docs/android.md index 320b62240382f..47530c6c1d478 100644 --- a/docs/android.md +++ b/docs/android.md @@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: ``` -$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" +$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" ``` -Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. +Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: diff --git a/examples/simple/README.md b/examples/simple/README.md index 0ff3425359a41..937008b243ee4 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -3,7 +3,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. ```bash -./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" +./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" ... From 890719311b6535e572f15965c6d7ec4ac2537f60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 28 Nov 2024 18:15:25 +0100 Subject: [PATCH 12/19] common: fix warning message when no GPU found (#10564) --- common/arg.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a6b7a1394f735..32d9a964c1716 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1370,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); + fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } ).set_env("LLAMA_ARG_N_GPU_LAYERS")); @@ -2104,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); + fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); From 6c595676899013102fdb0aa4b06a49954300c94a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 28 Nov 2024 19:17:49 +0100 Subject: [PATCH 13/19] server : (tests) don't use thread for capturing stdout/stderr, bump openai client library (#10568) * server : (tests) don't use thread for capturing stdout/stderr * test: bump openai to 1.55.2 * bump openai to 1.55.3 --- examples/server/tests/requirements.txt | 2 +- examples/server/tests/utils.py | 19 ++----------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 935a79114b45e..074b9d47bddce 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -2,6 +2,6 @@ aiohttp~=3.9.3 pytest~=8.3.3 huggingface_hub~=0.23.2 numpy~=1.26.4 -openai~=1.30.3 +openai~=1.55.3 prometheus-client~=0.20.0 requests~=2.32.3 diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index e31743c505d8e..a831f113f4161 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -8,7 +8,6 @@ import re import json import sys -import threading import requests import time from concurrent.futures import ThreadPoolExecutor, as_completed @@ -161,26 +160,12 @@ def start(self, timeout_seconds: int = 10) -> None: self.process = subprocess.Popen( [str(arg) for arg in [server_path, *server_args]], creationflags=flags, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stdout=sys.stdout, + stderr=sys.stdout, env={**os.environ, "LLAMA_CACHE": "tmp"}, ) server_instances.add(self) - def server_log(in_stream, out_stream): - for line in iter(in_stream.readline, b""): - print(line.decode("utf-8"), end="", file=out_stream) - - thread_stdout = threading.Thread( - target=server_log, args=(self.process.stdout, sys.stdout), daemon=True - ) - thread_stdout.start() - - thread_stderr = threading.Thread( - target=server_log, args=(self.process.stderr, sys.stderr), daemon=True - ) - thread_stderr.start() - print(f"server pid={self.process.pid}, pytest pid={os.getpid()}") # wait for server to start From 4c0a95b1074907ce7efe6f5bb6ae3351c01429ab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Nov 2024 20:45:07 +0200 Subject: [PATCH 14/19] llama : add missing model types --- src/llama.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index af5e686e07eda..22b951ba2a946 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2341,6 +2341,7 @@ enum e_model { MODEL_16B, MODEL_20B, MODEL_30B, + MODEL_32B, MODEL_34B, MODEL_35B, MODEL_40B, @@ -5330,6 +5331,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_16B: return "16B"; case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; + case MODEL_32B: return "32B"; case MODEL_34B: return "34B"; case MODEL_35B: return "35B"; case MODEL_40B: return "40B"; @@ -5690,7 +5692,10 @@ static void llm_load_hparams( case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break; case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break; + case 36: model.type = e_model::MODEL_3B; break; case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_14B; break; + case 64: model.type = e_model::MODEL_32B; break; case 80: model.type = e_model::MODEL_70B; break; default: model.type = e_model::MODEL_UNKNOWN; } From dc22344088a7ee81a1e4f096459b03a72f24ccdc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Nov 2024 20:46:40 +0200 Subject: [PATCH 15/19] ggml : remove redundant copyright notice + update authors --- AUTHORS | 186 ++++++++++++++++++++++++++- ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 4 - 2 files changed, 185 insertions(+), 5 deletions(-) diff --git a/AUTHORS b/AUTHORS index 1bd36158a72f4..2eb60806ad058 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# date: Wed Jun 26 19:36:34 EEST 2024 +# date: Thu Nov 28 20:46:15 EET 2024 # this file is auto-generated by scripts/gen-authors.sh 0cc4m @@ -7,6 +7,7 @@ 2f38b454 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 44670 <44670@users.noreply.github.com> +65a <10104049+65a@users.noreply.github.com> AN Long AT Aarni Koskela @@ -19,20 +20,28 @@ Adithya Balaji AdithyanI Adrian Adrian Hesketh +Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> Ahmet Zeer AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> +AidanBeltonS Aisuko +Akarshan Biswas Akarshan Biswas +Al Mochkin <14274697+amochkin@users.noreply.github.com> Albert Jin Alberto <57916483+albbus-stack@users.noreply.github.com> +Alberto Cabrera Pérez +Alberto Cabrera Pérez Alex Alex Azarov Alex Azarov Alex Klinkhamer Alex Klinkhamer Alex Nguyen +Alex O'Connell <35843486+acon96@users.noreply.github.com> Alex Petenchea Alex Renda +Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com> Alex von Gluck IV Alexey Parfenov Ali Chraghi <63465728+alichraghi@users.noreply.github.com> @@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Ananta Bastola Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> András Salamon +Andreas (Andi) Kunar Andrei Andrew Canis Andrew Downing Andrew Duffy Andrew Godfrey +Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com> +Andy Salerno Andy Tai +Anthony Van de Gejuchte +Antonis Makropoulos Arik Poznanski +Armen Kaleshian Artem Artem Zinnatullin Artyom Lebedev Asbjørn Olling Ásgeir Bjarni Ingvarsson +Asghar Ghorbani Ashish <1856117+ashishdatta@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashraful Islam @@ -76,12 +92,16 @@ Ben Williams Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Bernat Vadell +Bert Wagner Bingan <70050083+binganao@users.noreply.github.com> +Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov Branden Butler +Brandon Squizzato <35474886+bsquizz@users.noreply.github.com> Brian +Brian Cunnie Bruce MacDonald Bryan Honof CJ Pais @@ -90,32 +110,47 @@ Calvin Laurenson Cameron Cameron Kaiser Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> +CarryFun <76023481+CarryFun@users.noreply.github.com> +Carsten Kragelund Jørgensen +CarterLi999 <664681047@qq.com> Casey Primozic Casey Primozic CausalLM <148736309+CausalLM@users.noreply.github.com> Cebtenzzre Chad Brewbaker +Changyeon Kim Chao Jiang +Charles Xu <63788048+chaxu01@users.noreply.github.com> +Charles Xu +Chen Xi +Chen Xi Cheng Shao +Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Chris Elrod Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Kögler +Christian Köhnenkamp Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> Clint Herron +Conrad Kramer CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> +Csaba Kecskemeti Cuong Trinh Manh DAN™ Damian Stewart +Dan Johansson <164997844+eddnjjn@users.noreply.github.com> +Dan Johansson Dane Madsen DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> Daniel Bevenius Daniel Drake Daniel Hiltgen Daniel Illescas Romero +Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Daniele <57776841+daniandtheweb@users.noreply.github.com> DannyDaemonic Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> @@ -129,19 +164,28 @@ David Pflug David Renshaw David Sommers <12738+databyte@users.noreply.github.com> David Yang +DavidKorczynski Dawid Potocki Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dean Deins +Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com> +Derrick T. Woolworth Deven Mistry <31466137+deven367@users.noreply.github.com> +Dibakar Gope Didzis Gosko +Diego Devesa +Diogo Teles Sant'Anna Djip007 Don Mahurin DooWoong Lee (David) Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> +Dou Xinpeng <15529241576@163.com> +Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com> Douglas Hanley Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Ebey Abraham +Echo Nolan Ed Lee Ed Lepedus Eddie-Wang @@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com> Elton Kola Engininja2 <139037756+Engininja2@users.noreply.github.com> Equim +Eric Curtin +Eric Curtin Eric Sommerlade Eric Zhang <34133756+EZForever@users.noreply.github.com> Erik Garrison Erik Scholz +Esko Toivonen Ettore Di Giacinto Evan Jones Evan Miller @@ -166,19 +213,26 @@ FK Fabian Fabio R. Sluzala Faez Shakil +Faisal Zaghloul +Faisal Zaghloul +Fan Shupei FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> +Farbod Bijary <110523279+farbodbj@users.noreply.github.com> Fattire <528174+fat-tire@users.noreply.github.com> Felix Finn Voorhees Firat +FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com> Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com> Frank Mai FrankHB +Frankie Robertson Fred Douglas <43351173+fredlas@users.noreply.github.com> Frederik Vogel Gabe Goodhart +Gabe Goodhart GainLee Galunid Gary Linscott @@ -187,11 +241,13 @@ Gavin Zhao Genkagaku.GPT Georgi Gerganov Gilad S +Gilad S. <7817232+giladgd@users.noreply.github.com> Giuseppe Scrivano GiviMAD Govlzkoy Guillaume "Vermeille" Sanchez Guillaume Wenzek +Guoliang Hua <32868157+nbcsm@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Haggai Nuchi @@ -213,11 +269,14 @@ Hong Bo PENG Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Howard Su Hua Jiang +Huang Qi Huawei Lin Hugo Roussel +Huifeng Ou <79071290+ho2103@users.noreply.github.com> Ian Bull Ian Bull Ian Scrivener +Icecream95 Ido S IgnacioFDM Igor Okulist @@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Ionoclast Laboratories Isaac McFadyen IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> +Ivan +Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Ivan Komarov Ivan Stepanov JH23X <165871467+JH23X@users.noreply.github.com> +Jack Mousseau Jack Mousseau JackJollimore <130917767+JackJollimore@users.noreply.github.com> +Jaeden Amero Jaemin Son Jag Chadha Jakub N @@ -243,10 +306,14 @@ Jannis Schönleber Jared Van Bortel Jared Van Bortel Jason McCartney +Jason Stillerman Jean-Christophe Hoelt Jean-Michaël Celerier Jed Fox +Jeff Bolz +Jeffrey Morgan Jeffrey Quesnelle +Jeroen Mostert Jesse Jojo Johnson Jeximo Jhen-Jie Hong @@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com> Jiří Sejkora Joan Fontanals Joan Fontanals +João Dinis Ferreira +Joe Eli McIlvain +Joe Todd Johan Johannes Gäßler Johannes Rudolph @@ -274,7 +344,9 @@ Joyce Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Judd Julius Arkenberg +Jun Hee Yoo Jun Jie <71215065+junnjiee16@users.noreply.github.com> +Junil Kim Junyang Lin Juraj Bednar Justin Parker @@ -292,12 +364,14 @@ Karthik Sethuraman Kasumi <90275229+kasumi-1@users.noreply.github.com> Kawrakow <48489457+ikawrakow@users.noreply.github.com> Keiichi Tabata +Keke Han Kenvix ⭐ Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Kevin Gibbons Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Kwok Kevin Lo +Kevin Wang Kolen Cheung Konstantin Herud Konstantin Zhuravlyov @@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> Leonardo Neumann Li Tan Linwei Wang +Liu Jia <109258120+Septa2112@users.noreply.github.com> +Liu Jia LoganDark +Loïc Carrère LostRuins <39025047+LostRuins@users.noreply.github.com> Luciano Luo Tian Lyle Dean +M-A M. Yusuf Sarıgöz +Ma Mingfei Maarten ter Huurne Mack Straight Maël Kerbiriou MaggotHATE +Mahesh Madhav <67384846+heshpdx@users.noreply.github.com> Manuel <44313466+makuche@users.noreply.github.com> Marc Köhlbrugge Marco Matthies <71844+marcom@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marian Cepok Mark Fairbairn +Mark Zhuang Marko Tasic Markus Tavenrath Martin Delille @@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Mateusz Charytoniuk Matheus C. França Matheus Gabriel Alves Silva +Mathieu Geli Mathieu Nayrolles +Mathijs Henquet Mathijs de Bruin Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Pulver +Matt Stephenson Matteo Boschini <12133566+mbosc@users.noreply.github.com> +Matteo Mortari Mattheus Chediak Matthew Tejo Matvey Soloviev @@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com> Maximilian Winter Meng Zhang Meng, Hengyu +Mengqing Cao Merrick Christensen Michael Coppola +Michael Francis Michael Hueschen Michael Kesper Michael Klimenko @@ -365,41 +452,57 @@ Michael Podvitskiy Michael Potter Michael de Gans Michaël de Vries +Michał Tuszyński Mihai Mike Mikko Juola Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> +Minsoo Cheong Mirko185 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> +MistApproach <98988043+MistApproach@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Mohammadreza Hendiani Mohammadreza Hendiani +Molly Sophia +MorganRO8 <47795945+MorganRO8@users.noreply.github.com> Murilo Santana Musab Gultekin Nam D. Tran <42194884+namtranase@users.noreply.github.com> Nathan Epstein +Natsu NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> Nebula Neo Zhang <14088817+arthw@users.noreply.github.com> Neo Zhang Neo Zhang Jianyu Neuman Vong +Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com> +Nicholai Tukanov +Nico Bosshard Nicolai Weitkemper Nicolás Pérez Nigel Bosch Niklas Korz +NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com> Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth +OSecret <135510162+OLSecret@users.noreply.github.com> Oleksandr Nikitin Oleksii Maryshchenko Olivier Chafik Ondřej Čertík Ouadie EL FAROUKI +PAB +Pablo Duboue +Pascal Patry Patrice Ferlet Paul Tsochantaris +Pavel Zloi Pavol Rusnak +Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com> Pedro Cuenca Peter Sugihara Phil H <5756783+phiharri@users.noreply.github.com> @@ -407,10 +510,15 @@ Philip Taron Phillip Kravtsov Pierre Alexandre SCHEMBRI Pierrick Hymbert +Pieter Ouwerkerk +Plamen Minev +Prashant Vithule <119530321+Vithulep@users.noreply.github.com> Przemysław Pawełczyk Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Qingyou Meng Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> +R0CKSTAR +R0CKSTAR RJ Adriaansen Radoslav Gerganov Radosław Gryta @@ -419,11 +527,13 @@ Raj Hammeer Singh Hada Ralph Soika Rand Xie Randall Fitzgerald +Random Fly Reinforce-II Ren Xuancheng Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> RhinoDevel Riceball LEE +Rich Dougherty Richard Kiss Richard Roberson Rick G <26732651+TheFlipbook@users.noreply.github.com> @@ -439,21 +549,30 @@ Robey Holderith Robyn Roger Meier Roland <14355895+rbur0425@users.noreply.github.com> +Romain Biessy Romain D <90720+Artefact2@users.noreply.github.com> Romain Neutron Roman Parykin Ron Evans Ron Jailall +Roni Ronny Brendel Ronsor Rowan Hart +Ruchira Hasaranga +Ruixin Huang <18860020911@163.com> Rune <43761327+Rune-AI@users.noreply.github.com> +RunningLeon +RunningLeon Ryan Landay Ryder Wishart Ryuei Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> +SRHMorris <69468379+SRHMorris@users.noreply.github.com> +SXX SakuraUmi Salvador E. Tropea +Salvatore Mesoraca Sam Spilsbury Sami Farin <3876865+Safari77@users.noreply.github.com> Samuel Maynard @@ -463,23 +582,29 @@ Sebastián A SebastianApel <13675545+SebastianApel@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com> Sergey Alirzaev +Sergio López Sergio López Sertaç Özercan <852750+sozercan@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> ShadovvBeast Shakhar Dasgupta +Shane A Shangning Xu <32517059+xushangning@users.noreply.github.com> +Shankar +Shanshan Shen <467638484@qq.com> Shijie <821898965@qq.com> Shintarou Okada Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu Shuichi Tsutsumi +Shupei Fan Sigbjørn Skjæret Simon Willison Siwen Yu Sky Yan Slaren <2141330+slaren@users.noreply.github.com> Slava Primenko +Small Grass Forest SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> Someone Someone Serge @@ -491,12 +616,15 @@ Stefan Sydow Steffen Röcker Stephan Walter Stephen Nichols +Steve Bonds Steve Grubb Steven Prichard Steven Roussey Steward Garcia <57494570+FSSRepo@users.noreply.github.com> +StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> SuperUserNameMan +Sutou Kouhei Tai Duc Nguyen Taikono-Himazin Tameem <113388789+AhmadTameem@users.noreply.github.com> @@ -507,7 +635,9 @@ Theia Vogel Thérence <13496987+Royalphax@users.noreply.github.com> Thibault Terrasson Thomas Klausner +Thorsten Sommer Tim Miller +Tim Wang Timmy Knight Timothy Cronin <40186632+4imothy@users.noreply.github.com> Ting Lou @@ -517,24 +647,31 @@ Tom C Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tomas Tomáš Pazdiora +Tony Wasserka <4840017+neobrain@users.noreply.github.com> Tristan Druyen Tristan Ross +Trivikram Kamat <16024985+trivikr@users.noreply.github.com> Tungsten842 <886724vf@anonaddy.me> Tungsten842 Tushar UEXTM.com <84163508+uextm@users.noreply.github.com> +Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com> Ulrich Drepper Uzo Nweke Vaibhav Srivastav Val Kharitonov Valentin Konovalov Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> +Vali Malinoiu <0x4139@gmail.com> Victor Nogueira Victor Z. Peng +Viet-Anh NGUYEN (Andrew) +Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Vlad Vladimir Vladimir Malyutin Vladimir Zorin +VoidIsVoid <343750470@qq.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> Weird Constructor @@ -551,15 +688,22 @@ Xiang (Kevin) Li Xiao-Yong Jin XiaotaoChen Xiaoyi Chen +Xie Yanbo Xingchen Song(宋星辰) +Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com> Xuan Son Nguyen +Yaiko Yann Follet <131855179+YannFollet@users.noreply.github.com> Yaroslav Yazan Agha-Schrader Yiming Cui Yishuo Wang +Yoshi Suhara +Yoshi Suhara +Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> Yui +Yuri Khrustalev Yusuf Kağan Hanoğlu Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> ZHAOKAI WANG @@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com> Zenix Zhang Peiyuan Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> +Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> +Zhiyuan Li ZhouYuChen Ziad Ben Hadj-Alouane Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> @@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com> alonfaraj alwqx amd-lalithnc +amritahs-ibm andrijdavid anon998 <131767832+anon998@users.noreply.github.com> anzz1 @@ -588,14 +735,18 @@ apaz apcameron <37645737+apcameron@users.noreply.github.com> arch-btw <57669023+arch-btw@users.noreply.github.com> arcrank +ardfork <134447697+ardfork@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com> automaticcat +awatuna <23447591+awatuna@users.noreply.github.com> +b4b4o bandoti <141645996+bandoti@users.noreply.github.com> beiller bhubbb <79117352+bhubbb@users.noreply.github.com> bmwl bobqianic <129547291+bobqianic@users.noreply.github.com> +brucepro bryanSwk <93190252+bryanSwk@users.noreply.github.com> bsilvereagle bssrdf @@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com> crasm crasm daboe01 +daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com> +daminho <37615795+daminho@users.noreply.github.com> david raistrick ddh0 ddpasa <112642920+ddpasa@users.noreply.github.com> deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> +devojony <61173062+devojony@users.noreply.github.com> +ditsuke divinity76 dm4 dotpy314 <33351922+dotpy314@users.noreply.github.com> @@ -629,14 +784,18 @@ ebraminio eiery <19350831+eiery@users.noreply.github.com> eric8607242 fairydreaming <166155368+fairydreaming@users.noreply.github.com> +fengerhu1 <2748250768@qq.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> github-actions[bot] gliptic goerch grahameth <96447521+grahameth@users.noreply.github.com> +gtygo gwjr <502526+gwjr@users.noreply.github.com> h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> hankcs +haopeng <657407891@qq.com> +hipudding hoangmit hongbo.mo <352280764@qq.com> hopkins385 <98618192+hopkins385@users.noreply.github.com> @@ -649,12 +808,14 @@ hxer7963 hydai iSma iacore <74560659+iacore@users.noreply.github.com> +icppWorld <124377669+icppWorld@users.noreply.github.com> igarnier intelmatt <61025942+intelmatt@users.noreply.github.com> iohub jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> +jdomke <28772296+jdomke@users.noreply.github.com> jiez <373447296@qq.com> jneem joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> @@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com> kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunnis kuronekosaiko +kustaaya <58045274+kustaaya@users.noreply.github.com> kuvaus <22169537+kuvaus@users.noreply.github.com> kwin1412 <42286931+kwin1412@users.noreply.github.com> l3utterfly +laik ldwang le.chang leejet +leo-pony limitedAtonement liuwei-git <14815172+liuwei-git@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com> loonerin <132926317+loonerin@users.noreply.github.com> +ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> makomk manikbhandari maor-ps <154728172+maor-ps@users.noreply.github.com> +matiaslin <45382001+matiaslin@users.noreply.github.com> +matteo mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> minarchist mj-shifu <77107165+mj-shifu@users.noreply.github.com> mmyjona momonga <115213907+mmnga@users.noreply.github.com> +momonga <146910567+mmngays@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> mzcu nanahi <130121847+na-na-hi@users.noreply.github.com> @@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> +pculliton pengxin99 perserk +piDack <104877312+piDack@users.noreply.github.com> pmysl postmasters pudepiedj @@ -733,6 +903,7 @@ runfuture sandyiscool sasha0552 semidark +serhii-nakon <57632032+serhii-nakon@users.noreply.github.com> sharpHL <132747147+sharpHL@users.noreply.github.com> shibe2 singularity <12184989+singularity-s0@users.noreply.github.com> @@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com> slaren snadampal <87143774+snadampal@users.noreply.github.com> +standby24x7 staviq stduhpf strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> swittk takov751 <40316768+takov751@users.noreply.github.com> tarcey +tc-mb <157115220+tc-mb@users.noreply.github.com> texmex76 <40733439+texmex76@users.noreply.github.com> thement <40525767+thement@users.noreply.github.com> +thewh1teagle <61390950+thewh1teagle@users.noreply.github.com> tjohnman +toyer <2042519524@qq.com> tslmy ubik2 uint256_t uint256_t unbounded +uvos valiray <133289098+valiray@users.noreply.github.com> +vb vik viric vodkaslime <646329483@qq.com> vvhg1 <94630311+vvhg1@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com> +wangshuai09 <391746016@qq.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com> woachk <24752637+woachk@users.noreply.github.com> wonjun Jang woodx <124784234+woodx9@users.noreply.github.com> +wwoodsTM <104587230+wwoodsTM@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com> xaedes xaedes +xctan xloem <0xloem@gmail.com> yangli2 yuiseki +yuri@FreeBSD zakkor zhangkaihuo +zhentaoyu zhouwg <6889919+zhouwg@users.noreply.github.com> zhouwg zrm Ștefan-Gabriel Muscalu +杨朱 · Kiki 源文雨 <41315874+fumiama@users.noreply.github.com> +蕭澧邦 <45505768+shou692199@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index ced3788790671..69d3d327d1180 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -1,7 +1,3 @@ -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// SPDX-License-Identifier: MIT -// - #define GGML_COMMON_IMPL_C #include "ggml-common.h" From 678d7994f4da0af3d29046be99950ac999ee9762 Mon Sep 17 00:00:00 2001 From: Ting Lou Date: Fri, 29 Nov 2024 08:09:46 +0800 Subject: [PATCH 16/19] llava: return false instead of exit (#10546) --- examples/llava/clip.cpp | 15 +++++++++++---- examples/llava/llava.cpp | 28 +++++++++++++++++++--------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index aae49c965e905..7ba4cea58e80b 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -40,10 +40,17 @@ #include #include -#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#if defined(LLAVA_LOG_OFF) +# define LOG_INF(...) +# define LOG_WRN(...) +# define LOG_ERR(...) +# define LOG_DBG(...) +#else // defined(LLAVA_LOG_OFF) +# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#endif // defined(LLAVA_LOG_OFF) //#define CLIP_DEBUG_FUNCTIONS diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index be69885408433..4ca53a0b883b9 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -11,13 +11,17 @@ #include #include -#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) -#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) - -#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#if defined(LLAVA_LOG_OFF) +# define LOG_INF(...) +# define LOG_WRN(...) +# define LOG_ERR(...) +# define LOG_DBG(...) +#else // defined(LLAVA_LOG_OFF) +# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#endif // defined(LLAVA_LOG_OFF) // RGB uint8 image struct clip_image_u8 { @@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long errno = 0; size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer if (ferror(file)) { - die_fmt("read error: %s", strerror(errno)); + LOG_ERR("read error: %s", strerror(errno)); + free(buffer); + fclose(file); + return false; } if (ret != (size_t) fileSize) { - die("unexpectedly reached end of file"); + LOG_ERR("unexpectedly reached end of file"); + free(buffer); + fclose(file); + return false; } fclose(file); // Close the file From f095a649ec390e04dfab1b04e646ae8549dafaef Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 29 Nov 2024 00:18:02 -0600 Subject: [PATCH 17/19] vulkan: get the first command buffer submitted sooner (#10499) This is an incremental improvement over #9118 to get work to the GPU a bit sooner. The first part is to start with a smaller number of nodes before the first submit, and ramp it up to the current 100 nodes/submit. The second part is to reduce the dryrun overhead for all the nodes that just need to request descriptor space. With these changes I get around 1-2% speedup on RTX 4070 combined with my old Haswell-era CPU. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 60 ++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a833007fb3d02..849c119237bba 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5672,6 +5672,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } else { compute_ctx = ctx->compute_ctx.lock(); } + } else { + switch (node->op) { + case GGML_OP_REPEAT: + case GGML_OP_ACC: + case GGML_OP_GET_ROWS: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_CLAMP: + case GGML_OP_PAD: + case GGML_OP_CPY: + case GGML_OP_CONT: + case GGML_OP_DUP: + case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_UNARY: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ARGSORT: + case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_POOL_2D: + case GGML_OP_LEAKY_RELU: + { + // These operations all go through ggml_vk_op_f32, so short-circuit and + // do the only thing needed for the dryrun. + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op); + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + return false; + } + default: + break; + } } switch (node->op) { @@ -6401,16 +6443,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool first_node_in_batch = true; // true if next node will be first node in a batch int submit_node_idx = 0; // index to first node in a batch - // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution - constexpr int submit_count = 100; + // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution. + // Start with a smaller count to get work submitted right away, and increase it after each submit. + int nodes_per_submit = 20; int submitted_nodes = 0; + int submit_count = 0; for (int i = 0; i < cgraph->n_nodes; i++) { if (first_node_in_batch) { submit_node_idx = i; } - bool submit = (submitted_nodes >= submit_count) || (i == last_node); - + bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node); bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); @@ -6427,6 +6470,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (submit) { first_node_in_batch = true; submitted_nodes = 0; + switch (submit_count) { + case 0: + nodes_per_submit = 50; + break; + default: + nodes_per_submit = 100; + break; + } + submit_count++; } } From 938f6087421889a3af7d0786c64406ced2be81b8 Mon Sep 17 00:00:00 2001 From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Date: Fri, 29 Nov 2024 14:46:55 +0800 Subject: [PATCH 18/19] CANN: RoPE operator optimization (#10563) * [cann] RoPE operator optimization * [CANN]Code Formatting --------- Co-authored-by: noemotiovon --- ggml/src/ggml-cann/aclnn_ops.cpp | 241 ++++++++++++++++++++++++++++--- ggml/src/ggml-cann/ggml-cann.cpp | 13 +- 2 files changed, 222 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index d707efc5d1f48..b2d857e1e549b 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2965,7 +2965,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_cos_repeat_tensor, aclTensor* acl_sin_repeat_tensor, float theta_scale, float freq_scale, - bool is_neox) { + float attn_factor, bool is_neox) { // int sin/cos cache, cache has different repeat method depond on // @param.is_neox @@ -3017,6 +3017,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS); aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, nullptr, true); + ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor)); } // position @@ -3047,16 +3048,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor); - // // power[] * position[] * freq_scale / freq_factors[] - // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(), - // theta_length * - // sizeof(float_t)); - // aclTensor* acl_theat_final_tensor = aclnn_zero( - // ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length, - // theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t)); - // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor, - // acl_freq_factors_tensor, freq_scale); - // permute: [0,1,2,3]->[0,2,1,3] int64_t permute_ne[] = {arange_length, 1, position_length, 1}; size_t permute_nb[GGML_MAX_DIMS]; @@ -3092,6 +3083,12 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); + // attn_factor + if (attn_factor != 1) { + aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); + aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true); + } + // repeat if (is_neox) { int64_t repeatsArray[] = {1, 1, 1, 2}; @@ -3155,15 +3152,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); - // TODO: attn_factor != 1 - GGML_ASSERT(attn_factor == 1); // TODO: n_dims <= ne0 GGML_ASSERT(n_dims == ne0); GGML_ASSERT(n_dims % 2 == 0); // TODO: ext_factor != 0 GGML_ASSERT(ext_factor == 0); - // TODO: type == GGML_TYPE_F16 - GGML_ASSERT(src0->type == GGML_TYPE_F32); const float theta_scale = powf(freq_base, -2.0f / n_dims); @@ -3194,7 +3187,217 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, - theta_scale, freq_scale, is_neox); + theta_scale, freq_scale, attn_factor, is_neox); + + aclTensor* acl_src = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + +#ifdef ASCEND_310P + // Special ROPE operation for 310P + + // roll input + void* input_roll_buffer; + aclTensor* acl_minus_one_tensor; + void* minus_one_scale_buffer = nullptr; + ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); + ggml_cann_pool_alloc minus_one_scale_allocator( + ctx.pool(), sizeof(float_t) * src0->ne[0]); + if (!is_neox) { + // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] + input_roll_buffer = roll_allocator.get(); + int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), + src0->ne[2], src0->ne[3]}; + size_t input_roll_nb[GGML_MAX_DIMS]; + input_roll_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; + } + aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, input_roll_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + src0->data, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, input_roll_nb, + GGML_MAX_DIMS); + + int64_t shifts[] = {1}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, 1, -1, 1, ...] + minus_one_scale_buffer = minus_one_scale_allocator.get(); + + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = aclnn_values( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); + int64_t dim = 3; + int64_t* index = new int64_t[src0->ne[0]]; + for (int i = 0; i < src0->ne[0]; i++) { + index[i] = i / 2 * 2; + } + int64_t index_num = src0->ne[0]; + float value = -1; + aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, + index_num, value); + } else { + // roll input: [q0,q1,q2,...] -> + // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] + input_roll_buffer = roll_allocator.get(); + aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); + aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0); + + int64_t shifts[] = {src0->ne[0] / 2}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); + + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + // init [-1, -1, -1, 1, 1,1,...] + minus_one_scale_buffer = minus_one_scale_allocator.get(); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = aclnn_values( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); + // -1 * first half + int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; + size_t first_half_nb[GGML_MAX_DIMS]; + first_half_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; + } + aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( + minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, + first_half_nb, GGML_MAX_DIMS); + bool inplace = true; + float scale = -1; + aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); + ACL_CHECK(aclDestroyTensor(acl_first_half_tensor)); + } + + // TODO: n_dims < ne0 + GGML_ASSERT(n_dims == src0->ne[0]); + + // input * scale + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), + ggml_nbytes(src0)); + void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); + size_t input_nb[GGML_MAX_DIMS]; + input_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor( + input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + + aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, + acl_input_roll_mul_scale_tensor); + + // output + void* output_fp32_buffer; + if (src0->type == GGML_TYPE_F32) { + aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor); + aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor); + aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst); + // TODO: ne0 != n_dims in mode2 + } else if (src0->type == GGML_TYPE_F16) { + size_t input_fp32_nb[GGML_MAX_DIMS]; + input_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + } + ggml_cann_pool_alloc fp32_allocator1( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer1 = fp32_allocator1.get(); + aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( + input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + ggml_cann_pool_alloc fp32_allocator2( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer2 = fp32_allocator2.get(); + aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( + input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + + ggml_cann_pool_alloc fp32_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + output_fp32_buffer = fp32_allocator.get(); + aclTensor* output_fp32_tensor = ggml_cann_create_tensor( + output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, + input_fp32_tensor2); + aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + output_fp32_tensor); + aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); + + ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); + ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + } + return; +#endif + + // src0 == GGML_TYPE_F16 + // TODO: optimization this `if` code + if (src0->type == GGML_TYPE_F16) { + ggml_cann_pool_alloc sin_final_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); + ggml_cann_pool_alloc cos_final_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); + void* sin_final_buffer = sin_final_allocator.get(); + void* cos_final_buffer = cos_final_allocator.get(); + + int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; + size_t sin_final_nb[GGML_MAX_DIMS]; + sin_final_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1]; + } + aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor( + sin_final_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), sin_final_ne, sin_final_nb, + GGML_MAX_DIMS); + aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor( + cos_final_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), sin_final_ne, sin_final_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor, + ggml_cann_type_mapping(src0->type)); + aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor, + ggml_cann_type_mapping(src0->type)); + ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + acl_sin_reshape_tensor = acl_sin_final_tensor; + acl_cos_reshape_tensor = acl_cos_final_tensor; + } uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -3206,10 +3409,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_mode = 1; } - aclTensor* acl_x = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( - acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, + acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); @@ -3219,7 +3420,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize, executor, ctx.stream())); - ACL_CHECK(aclDestroyTensor(acl_x)); + ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_dst)); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index bcb54e44404db..04e25b8ab1a23 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1739,7 +1739,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_ROPE: { // TODO: with ops-test v == 1 float * ext_factor = (float*)((int32_t*)op->op_params + 7); - float * attn_factor = (float*)((int32_t*)op->op_params + 8); // TODO: n_dims <= ne0 if (op->src[0]->ne[0] != op->op_params[1]) { return false; @@ -1748,17 +1747,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, if (*ext_factor != 0) { return false; } - // TODO: attn_factor != 1 - if (*attn_factor != 1) { - return false; - } - //TODO: type == GGML_TYPE_F16 - switch (op->src[0]->type) { - case GGML_TYPE_F32: - return true; - default: - return false; - } + return true; } case GGML_OP_UPSCALE: { // aclnnUpsampleNearest2dGetWorkspaceSize not support From 266b8519ee6d21e7ba2bf56f5629e20a181fee8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Fri, 29 Nov 2024 09:49:43 +0000 Subject: [PATCH 19/19] sycl : Reroute permuted mul_mats through oneMKL (#10408) This PR fixes the failing MUL_MAT tests for the sycl backend. --- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b6392ed8dcc6a..aabcdc22422fc 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3447,8 +3447,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - // KQ single-batch - ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + // TODO: Refactor and cleanup of mul mat dispatching. + if (src0->ne[3] == 1 && src1->ne[3] == 1) { + // KQ single-batch + // mmv p021 was specific for these dimensions + ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + } else { + // The kernel from the if path is faster for that specific case, but does not support all mul mats. + ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + } } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);