From 9c405c9f9a7cfd23511fd6b2de05dc72481119b4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 20 Feb 2024 15:58:27 +0100 Subject: [PATCH 01/51] Server: use llama_chat_apply_template (#5593) * server: use llama_chat_apply_template * server: remove trailing space * server: fix format_chat * server: fix help message Co-authored-by: Georgi Gerganov * server: fix formatted_chat --------- Co-authored-by: Georgi Gerganov --- examples/server/oai.hpp | 6 ++-- examples/server/server.cpp | 17 +++++----- examples/server/utils.hpp | 69 ++++++++++++++++++-------------------- llama.cpp | 2 +- 4 files changed, 45 insertions(+), 49 deletions(-) diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp index 2eca8a9fb4560..ff4ad69943552 100644 --- a/examples/server/oai.hpp +++ b/examples/server/oai.hpp @@ -15,13 +15,11 @@ using json = nlohmann::json; inline static json oaicompat_completion_params_parse( + const struct llama_model * model, const json &body, /* openai api json semantics */ const std::string &chat_template) { json llama_params; - std::string formatted_prompt = chat_template == "chatml" - ? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...) - : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...) llama_params["__oaicompat"] = true; @@ -34,7 +32,7 @@ inline static json oaicompat_completion_params_parse( // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = formatted_prompt; + llama_params["prompt"] = format_chat(model, chat_template, body["messages"]); llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 23482ed95075a..c7821eca68cba 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -37,7 +37,7 @@ struct server_params std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; - std::string chat_template = "chatml"; + std::string chat_template = ""; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -1937,8 +1937,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); - printf(" --chat-template FORMAT_NAME"); - printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str()); + printf(" --chat-template JINJA_TEMPLATE\n"); + printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); + printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n"); printf("\n"); } @@ -2389,13 +2390,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - std::string value(argv[i]); - if (value != "chatml" && value != "llama2") { - fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str()); + if (!verify_custom_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); invalid_param = true; break; } - sparams.chat_template = value; + sparams.chat_template = argv[i]; } else if (arg == "--override-kv") { @@ -2913,7 +2914,7 @@ int main(int argc, char **argv) if (!validate_api_key(req, res)) { return; } - json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template); + json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template); const int task_id = llama.queue_tasks.get_new_id(); llama.queue_results.add_waiting_task_id(task_id); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0ee670dbadca5..e954fb0ef0413 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -167,50 +167,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v : default_value; } -inline std::string format_llama2(std::vector messages) -{ - std::ostringstream output; - bool is_inside_turn = false; +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid +inline bool verify_custom_template(const std::string & tmpl) { + llama_chat_message chat[] = {{"user", "test"}}; + std::vector buf(1); + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size()); + return res >= 0; +} - for (auto it = messages.begin(); it != messages.end(); ++it) { - if (!is_inside_turn) { - output << "[INST] "; - } - std::string role = json_value(*it, "role", std::string("user")); - std::string content = json_value(*it, "content", std::string("")); - if (role == "system") { - output << "<>\n" << content << "\n<>\n\n"; - is_inside_turn = true; - } else if (role == "user") { - output << content << " [/INST]"; - is_inside_turn = true; - } else { - output << " " << content << " "; - is_inside_turn = false; - } +// Format given chat. If tmpl is empty, we take the template from model metadata +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) +{ + size_t alloc_size = 0; + // vector holding all allocated string to be passed to llama_chat_apply_template + std::vector str(messages.size() * 2); + std::vector chat(messages.size()); + + for (size_t i = 0; i < messages.size(); ++i) { + auto &curr_msg = messages[i]; + str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); + str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); + alloc_size += str[i*2 + 1].length(); + chat[i].role = str[i*2 + 0].c_str(); + chat[i].content = str[i*2 + 1].c_str(); } - LOG_VERBOSE("format_llama2", {{"text", output.str()}}); + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + std::vector buf(alloc_size * 2); - return output.str(); -} - -inline std::string format_chatml(std::vector messages) -{ - std::ostringstream chatml_msgs; + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - for (auto it = messages.begin(); it != messages.end(); ++it) { - chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; - chatml_msgs << json_value(*it, "content", std::string("")) - << "<|im_end|>\n"; + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); } - chatml_msgs << "<|im_start|>assistant" << '\n'; - - LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}}); + std::string formatted_chat(buf.data(), res); + LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); - return chatml_msgs.str(); + return formatted_chat; } // diff --git a/llama.cpp b/llama.cpp index 5de07dfa999a7..4296eca3261e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12602,7 +12602,7 @@ LLAMA_API int32_t llama_chat_apply_template( // load template from model std::vector model_template(2048, 0); // longest known template is about 1200 bytes std::string template_key = "tokenizer.chat_template"; - int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size()); + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); if (res < 0) { // worst case: there is no information about template, we will use chatml by default curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal From 4ed8e4fbef6a15afd993bfcd9ffa279841e18ef1 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 20 Feb 2024 18:30:27 +0100 Subject: [PATCH 02/51] llava : add explicit instructions for llava-1.6 (#5611) This commit contains a suggestion for the README.md in the llava example. The suggestion adds explicit instructions for how to convert a llava-1.6 model and run it using llava-cli. The motivation for this is that having explicit instructions similar to the 1.5 instructions will make it easier for users to try this out. Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index e42db6e5ad3cf..25ea967153510 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -59,14 +59,40 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. ## LLaVA 1.6 gguf conversion - -1) Backup your pth/safetensor model files as llava-surgery modifies them -2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models: +1) First clone a LLaVA 1.6 model: +```console +git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b +``` +2) Backup your pth/safetensor model files as llava-surgery modifies them +3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +```console +python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ +``` - you will find a llava.projector and a llava.clip file in your model directory -3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json. -4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision` +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +```console +mkdir vit +cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin +cp ../llava-v1.6-vicuna-7b/llava.projector vit/ +curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json +``` + +5) Create the visual gguf model: +```console +python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -5) Everything else as usual: convert.py the hf model, quantize as needed + +6) Then convert the model to gguf format: +```console +python ./convert.py ../llava-v1.6-vicuna-7b/ +``` + +7) And finally we can run the llava-cli using the 1.6 model version: +```console +./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 +``` + **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) **note** llava-1.6 greatly benefits from batched prompt processing (defaults work) From 06bf2cf8c406e6b70dbf9b431a02fa0ad845b9df Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 20 Feb 2024 20:06:17 +0100 Subject: [PATCH 03/51] make : fix debug build with CUDA (#5616) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 211a08d7ff186..41c79c135467a 100644 --- a/Makefile +++ b/Makefile @@ -173,7 +173,7 @@ ifdef LLAMA_DEBUG MK_LDFLAGS += -g ifeq ($(UNAME_S),Linux) - MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS + MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS endif else MK_CPPFLAGS += -DNDEBUG From 6560bed3f066c876682464762cad90f1e28e3f1b Mon Sep 17 00:00:00 2001 From: CJ Pais Date: Tue, 20 Feb 2024 11:07:22 -0800 Subject: [PATCH 04/51] server : support llava 1.6 (#5553) * server: init working 1.6 * move clip_image to header * remove commented code * remove c++ style from header * remove todo * expose llava_image_embed_make_with_clip_img * fix zig build --- Makefile | 2 +- build.zig | 3 ++- examples/llava/llava.cpp | 2 +- examples/llava/llava.h | 2 ++ examples/server/server.cpp | 36 +++--------------------------------- 5 files changed, 9 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index 41c79c135467a..f03faf6eda0fb 100644 --- a/Makefile +++ b/Makefile @@ -719,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2) diff --git a/build.zig b/build.zig index 699738f3dd509..c0af454dc9e92 100644 --- a/build.zig +++ b/build.zig @@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void { const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); const clip = make.obj("clip", "examples/llava/clip.cpp"); + const llava = make.obj("llava", "examples/llava/llava.cpp"); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); @@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void { _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4cb65a07b6740..1a1cf7c78bf34 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * return true; } -static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { +bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 9e9466a5d1726..2d40f3f1d5f84 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -31,6 +31,8 @@ struct llava_image_embed { /** sanity check for clip <-> llava embed size match */ LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); + /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c7821eca68cba..eb01729fa7a8a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,7 @@ #include "oai.hpp" #include "../llava/clip.h" +#include "../llava/llava.h" #include "stb_image.h" @@ -997,43 +998,12 @@ struct llama_server_context { continue; } - clip_image_f32_batch img_res_v; - img_res_v.size = 0; - img_res_v.data = nullptr; - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v)) - { - LOG_TEE("Error processing the given image"); - clip_free(clp_ctx); - clip_image_f32_batch_free(img_res_v); - return false; - } - if (img_res_v.size == 0) - { - LOG_TEE("Error processing the given image"); - return false; - } - - // note: assumes only one image was returned by clip_image_preprocess - clip_image_f32 * img_res = img_res_v.data; - img.image_tokens = clip_n_patches(clp_ctx); - img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); - if (!img.image_embedding) - { - LOG_TEE("Unable to allocate memory for image embeddings\n"); - clip_image_f32_batch_free(img_res_v); - clip_free(clp_ctx); - return false; - } - LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); - if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) - { - LOG_TEE("Unable to encode image\n"); - clip_image_f32_batch_free(img_res_v); + if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { + LOG_TEE("Error processing the given image"); return false; } - clip_image_f32_batch_free(img_res_v); img.request_encode_image = false; } From a14679cc30c785e75d38028bae6ec39c6209ddef Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:39:52 +0200 Subject: [PATCH 05/51] IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590) * iq4_nl: squash commits for easier rebase * Basics (quantize, dequantize) * CUDA dequantize and dot product * Slightly faster CUDA dot product (120 t/s) * Switch to 6-bit scales * Scalar dot product * AVX2 dot product * ARM_NEON dot product * Works on metal, but still slow * Slightly better Metal dot product * Another small Metal improvement * Metal dot product is getting there * Faster CUDA dot product * Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided * Report the actual bpw * Add _xs mix that is 4.05 bpw for non-MoE models * Remove IQ4_XS for now, slightly adjust kvalues_iq4nl * AVX2 dot product uses Q8_0 instead of Q8_K * Add to test-backend-ops * Minor fix * Also use use Q5_K for attn_output in MoE models * Fixes after merging latest master * Switching to blocks of 32 * AVX2 for blocks of 32 * Scaler dot product for blocks of 32 * ARM_NEON dot product for blocks of 32 * Metal kernels for blocks of 32 * Slightly faster Metal kernels * iq4_nl: Fix after merging with master * iq4_nl: another fix after merging with master * Use IQ4_NL instead of Q4_K when using k-quants is not possible * Fix typo that makes several tests fail * It was the ggml_vdotq thing missed inside the brackets --------- Co-authored-by: Iwan Kawrakow --- examples/quantize/quantize.cpp | 1 + ggml-cuda.cu | 98 +++++++++++++- ggml-metal.m | 35 +++++ ggml-metal.metal | 215 +++++++++++++++++++++++++++++- ggml-quants.c | 234 ++++++++++++++++++++++++++++++++- ggml-quants.h | 13 ++ ggml.c | 30 +++++ ggml.h | 2 + llama.cpp | 17 ++- llama.h | 1 + tests/test-backend-ops.cpp | 1 + 11 files changed, 640 insertions(+), 7 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ea7ba50c96eb6..37520857f99f7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -32,6 +32,7 @@ static const std::vector QUANT_OPTIONS = { { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, + { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 6caae56b0244b..e7c211d7d6087 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -528,6 +528,15 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +#define QK4_NL 32 +#define QR4_NL 2 +#define QI4_NL (QK4_NL / (4*QR4_NL)) +typedef struct { + half d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; +static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -1987,6 +1996,26 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ } +static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +template +static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int tid = threadIdx.x; + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { @@ -4732,6 +4761,56 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( #endif } +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values, + int & val1, int & val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} +#endif + +static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = __dp4a(v1, q8[l+0], sumi1); + sumi2 = __dp4a(v2, q8[l+4], sumi2); + } + +#else + const uint8_t * q4 = bq->qs + 4*iqs; + const int8_t * q8 = bq8_1->qs + 4*iqs; + + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { + sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; + sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; + } +#endif + const float d = (float)bq->d * __low2float(bq8_1->ds); + return d * (sumi1 + sumi2); +} + template static __device__ __forceinline__ void mul_mat_q( @@ -6777,6 +6856,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c dequantize_block_iq1_s<<>>(vx, y); } +template +static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_iq4_nl<<>>(vx, y); +} + template static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; @@ -6818,6 +6903,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6855,6 +6942,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -8599,6 +8688,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -8623,6 +8713,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -8724,6 +8815,10 @@ static void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_q_cuda (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; default: GGML_ASSERT(false); break; @@ -11446,7 +11541,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) { + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } diff --git a/ggml-metal.m b/ggml-metal.m index 956e323a0d053..0d4aa43093739 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -62,6 +62,7 @@ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -85,6 +86,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, @@ -104,6 +106,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, @@ -120,6 +123,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, @@ -136,6 +140,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ALIBI_F32, @@ -448,6 +453,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); @@ -471,6 +477,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); @@ -490,6 +497,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); @@ -506,6 +514,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); @@ -522,6 +531,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); @@ -1338,6 +1348,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } @@ -1478,6 +1489,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1525,6 +1542,11 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src0t == GGML_TYPE_IQ4_NL) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1619,6 +1641,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } @@ -1762,6 +1785,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1825,6 +1854,11 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src2t == GGML_TYPE_IQ4_NL) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src2t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1867,6 +1901,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index f0d77d446755c..c223a981c246a 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2531,6 +2531,12 @@ typedef struct { uint8_t scales[QK_K/16]; } block_iq1_s; +// Non-linear quants +#define QK4_NL 32 +typedef struct { + half d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; //====================================== dot products ========================= @@ -4384,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl( const uint i13 = im/ne12; const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -4447,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl( } } +constexpr constant static float kvalues_iq4nl_f[16] = { + -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f +}; + +void kernel_mul_mv_iq4_nl_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK4_NL; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * 2 + sgitg) * 2; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + const int ix = tiisg/2; // 0...15 + const int it = tiisg%2; // 0 or 1 + + shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[2]={0.f}, all_sum; + + device const float * yb = y + ix * QK4_NL + it * 8; + + uint32_t aux32[2]; + thread const uint8_t * q8 = (thread const uint8_t *)aux32; + + float4 qf1, qf2; + + for (int ib = ix; ib < nb; ib += 16) { + + device const float4 * y4 = (device const float4 *)yb; + yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; + + for (int row = 0; row < 2; ++row) { + + device const block_iq4_nl & xb = x[row*nb + ib]; + device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it); + + float4 acc1 = {0.f}, acc2 = {0.f}; + + aux32[0] = q4[0] | (q4[1] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]}; + qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]}; + acc1 += yl[0] * qf1; + acc2 += yl[1] * qf2; + + aux32[0] = q4[2] | (q4[3] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]}; + qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]}; + acc1 += yl[2] * qf1; + acc2 += yl[3] * qf2; + + acc1 += acc2; + + sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); + + } + + yb += 16 * QK4_NL; + } + + for (int row = 0; row < 2; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + [[host_name("kernel_mul_mv_iq1_s_f32")]] kernel void kernel_mul_mv_iq1_s_f32( device const void * src0, @@ -4475,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32( kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); } +[[host_name("kernel_mul_mv_iq4_nl_f32")]] +kernel void kernel_mul_mv_iq4_nl_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} //============================= templates and their specializations ============================= @@ -4838,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & } } +template +void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -5381,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -5421,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -5473,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -6503,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq4_nl_f32")]] +kernel void kernel_mul_mv_id_iq4_nl_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq4_nl_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} diff --git a/ggml-quants.c b/ggml-quants.c index 3319d2ccf1812..6336538f0e99e 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3754,6 +3754,26 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in } } +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) { + assert(k % QK4_NL == 0); + const int nb = k / QK4_NL; + + for (int i = 0; i < nb; i++) { + + const uint8_t * qs = x[i].qs; + + const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK4_NL/2; ++j) { + y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf]; + y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4]; + } + y += QK4_NL; + qs += QK4_NL/2; + } +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -9148,7 +9168,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -// TODO void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -9452,7 +9471,100 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const *s = sumf; #endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * restrict x = vx; + const block_q8_0 * restrict y = vy; + + const int nb = n / QK4_NL; + +#if defined __ARM_NEON + const int8x16_t values = vld1q_s8(kvalues_iq4nl); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + uint8x16x2_t q4bits; + int8x16x4_t q4b; + int8x16x4_t q8b; + int32x4_t prod_1, prod_2; + float sumf = 0; + + for (int ib = 0; ib < nb; ib += 2) { + + q4bits.val[0] = vld1q_u8(x[ib+0].qs); + q4bits.val[1] = vld1q_u8(x[ib+1].qs); + q8b.val[0] = vld1q_s8(y[ib+0].qs); + q8b.val[1] = vld1q_s8(y[ib+0].qs + 16); + q8b.val[2] = vld1q_s8(y[ib+1].qs); + q8b.val[3] = vld1q_s8(y[ib+1].qs + 16); + + q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b)); + q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b)); + q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + + prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); + prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); + + sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2); + + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int ib = 0; ib < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs); + const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); + const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)), + _mm256_cvtepi32_ps(p_1), accum1); + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)), + _mm256_cvtepi32_ps(p_2), accum2); + + y += 2; + x += 2; + } + + *s = hsum_float_8(_mm256_add_ps(accum1, accum2)); + +#else + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +#endif } // ================================ IQ2 quantization ============================================= @@ -10729,3 +10841,123 @@ size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, in } return nrow * nblock * sizeof(block_iq1_s); } + +// ============================ 4-bit non-linear quants + +static inline int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + +static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x, + ggml_fp16_t * dh, uint8_t * q4, + float * weight, uint8_t * L, + const int8_t * values, + const float * quant_weights) { + + const int ntry = 7; + + float sigma2 = 0; + for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j]; + sigma2 *= 2.f/QK4_NL; + + const int nb = QK4_NL/block_size; + + memset(q4, 0, QK4_NL/2); + for (int ib = 0; ib < nb; ++ib) { + dh[ib] = GGML_FP32_TO_FP16(0.f); + const float * xb = x + ib*block_size; + if (quant_weights) { + const float * qw = quant_weights + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < block_size; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (!amax) { + continue; + } + float d = -max/values[0]; + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < block_size; ++j) { + float al = id*xb[j]; + int l = best_index_int8(16, values, al); + float q = values[l]; + float w = weight[j]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + float best_id = id; + d = sumqx/sumq2; + float best = d*sumqx; + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx = sumq2 = 0; + for (int j = 0; j < block_size; ++j) { + float al = id*xb[j]; + int l = best_index_int8(16, values, al); + float q = values[l]; + float w = weight[j]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d * sumqx; + best_id = id; + } + } + dh[ib] = GGML_FP32_TO_FP16(d); + for (int j = 0; j < block_size; ++j) { + L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]); + } + } + for (int i = 0; i < QK4_NL/32; ++i) { + for (int j = 0; j < 16; ++j) { + q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4); + } + } +} + +size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK4_NL == 0); + int nblock = n_per_row/QK4_NL; + char * qrow = (char *)dst; + uint8_t L[QK4_NL]; + float weight[32]; + for (int row = 0; row < nrow; ++row) { + block_iq4_nl * iq4 = (block_iq4_nl *)qrow; + for (int ibl = 0; ibl < nblock; ++ibl) { + const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; + quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw); + } + src += n_per_row; + qrow += nblock*sizeof(block_iq4_nl); + } + return nrow * nblock * sizeof(block_iq4_nl); +} + +void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_NL == 0); + block_iq4_nl * restrict y = vy; + quantize_row_iq4_nl_reference(x, y, k); +} + +void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { + assert(k % QK4_NL == 0); + quantize_iq4_nl(x, y, 1, k, NULL, NULL); +} + diff --git a/ggml-quants.h b/ggml-quants.h index ad381cfab022d..113623b62938a 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -198,6 +198,14 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +// Non-linear quants +#define QK4_NL 32 +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; +static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); + #ifdef __cplusplus extern "C" { #endif @@ -217,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); +void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); @@ -232,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -251,6 +261,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -268,6 +279,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -276,6 +288,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/ggml.c b/ggml.c index d129df50548a3..91adbb0aee6bc 100644 --- a/ggml.c +++ b/ggml.c @@ -690,6 +690,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ4_NL] = { + .type_name = "iq4_nl", + .blck_size = QK4_NL, + .type_size = sizeof(block_iq4_nl), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, + .from_float = quantize_row_iq4_nl, + .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference, + .vec_dot = ggml_vec_dot_iq4_nl_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2291,6 +2303,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; + case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7702,6 +7715,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7970,6 +7984,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -8091,6 +8106,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: default: { GGML_ASSERT(false); @@ -10858,6 +10874,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -11039,6 +11056,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: default: { GGML_ASSERT(false); @@ -11237,6 +11255,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -11911,6 +11930,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11989,6 +12009,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -19455,6 +19476,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ4_NL: + { + GGML_ASSERT(start % QK4_NL == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/ggml.h b/ggml.h index 004d09c700f32..bed7a36a0ee6a 100644 --- a/ggml.h +++ b/ggml.h @@ -355,6 +355,7 @@ extern "C" { GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ3_XXS = 18, GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -393,6 +394,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors }; // available tensor operations: diff --git a/llama.cpp b/llama.cpp index 4296eca3261e8..3748d5eac8a50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2527,6 +2527,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2877,6 +2878,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; default: return "unknown, may not work"; } @@ -10354,6 +10356,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) { + new_type = GGML_TYPE_Q5_K; + } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; @@ -10406,6 +10411,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; } } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) { + if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = GGML_TYPE_Q5_K; @@ -10422,7 +10430,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { new_type = GGML_TYPE_Q5_K; } @@ -10489,8 +10497,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: - case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; - case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; @@ -10531,7 +10539,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break; + case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/llama.h b/llama.h index 77a84c18a69cf..8ba20696f8af9 100644 --- a/llama.h +++ b/llama.h @@ -101,6 +101,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index ef37c5af275fe..55db42bf6e851 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1918,6 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, + GGML_TYPE_IQ4_NL, }; // unary ops From 88c46cbdac05cebd936511b1d3c74112e721615f Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Wed, 21 Feb 2024 17:52:06 +0800 Subject: [PATCH 06/51] [SYCL] conext add name (#5624) * [SYCL] conext add name * name should start with SYCL* --- ggml-sycl.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index df182611241b0..b897828f967b1 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -14642,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { - int device = (int) (intptr_t) buft->context; + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + int device = (int) buft_ctx->device; ggml_sycl_set_device(device); int device_index = get_device_index_by_id(device); @@ -14720,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { ggml_backend_sycl_buffer_types[i] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, - /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)}, }; } ggml_backend_sycl_buffer_type_initialized = true; @@ -14782,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { // backend -struct ggml_backend_context_sycl { - int device; -}; - static const char * ggml_backend_sycl_name(ggml_backend_t backend) { return GGML_SYCL_NAME; @@ -14793,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) { } static void ggml_backend_sycl_free(ggml_backend_t backend) { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; delete sycl_ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return ggml_backend_sycl_buffer_type(sycl_ctx->device); } @@ -14809,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -14827,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -14842,7 +14839,7 @@ catch (sycl::exception const &exc) { } static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait())); @@ -14878,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba } static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); @@ -15092,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) { // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_sycl_set_main_device(device); - ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl { - /* .device = */ device + ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context { + /* .device = */ device, + /* .name = */ GGML_SYCL_NAME + std::to_string(device), }; ggml_backend_t sycl_backend = new ggml_backend { From 580111d42b3b6ad0a390bfb267d6e3077506eb31 Mon Sep 17 00:00:00 2001 From: postmasters Date: Wed, 21 Feb 2024 05:08:22 -0800 Subject: [PATCH 07/51] llama : add `gemma` model (#5631) There are couple things in this architecture: 1. Shared input and output embedding parameters. 2. Key length and value length are not derived from `n_embd`. More information about the models can be found at https://ai.google.dev/gemma. GGUFs can be downloaded from https://huggingface.co/google. --- README.md | 1 + gguf-py/gguf/constants.py | 15 ++++ llama.cpp | 170 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 186 insertions(+) diff --git a/README.md b/README.md index 747d2e98b5a94..225db8e49ce39 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,7 @@ Typically finetunes of the base models below are supported as well. - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) +- [x] [Gemma](https://ai.google.dev/gemma) **Multimodal models:** diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 114a9a9743081..8f9139d1b7eca 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + GEMMA = auto() class MODEL_TENSOR(IntEnum): @@ -167,6 +168,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.GEMMA: "gemma", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -511,6 +513,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.GEMMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_NORM, + ], # TODO } diff --git a/llama.cpp b/llama.cpp index 3748d5eac8a50..3a226c4260c0b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -208,6 +208,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_GEMMA, LLM_ARCH_UNKNOWN, }; @@ -234,6 +235,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, }; enum llm_kv { @@ -760,6 +762,22 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_GEMMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -3243,6 +3261,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GEMMA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 18: model.type = e_model::MODEL_2B; break; + case 28: model.type = e_model::MODEL_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -4360,6 +4388,37 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_GEMMA: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + + const int64_t n_ff = hparams.n_ff; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + for (uint32_t i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -7366,6 +7425,113 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_gemma() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + cb(cur, "kqv_out", il); + } + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph( @@ -7474,6 +7640,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_GEMMA: + { + result = llm.build_gemma(); + } break; default: GGML_ASSERT(false); } From cc6cac08e38e32bf40bbe07e9e8f8f0130b5fd94 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 21 Feb 2024 14:36:57 +0100 Subject: [PATCH 08/51] llava : add --skip-unknown to 1.6 convert.py (#5632) This commit adds the `--skip-unknown` option to the convert.py script and removes the saving of the updated checkpoints to avoid updating possibly checked out files. The motivation for this change is that this was done for 1.5 in Commit fc0c8d286a533363a9a663510b62af85ffad58b3 ("llava : update surgery script to not remove tensors") and makes the examples more consistent. Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 13 ++++++------- examples/llava/llava-surgery-v2.py | 12 ------------ 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 25ea967153510..35e6d9e5d00f7 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -63,13 +63,12 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director ```console git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b ``` -2) Backup your pth/safetensor model files as llava-surgery modifies them -3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory -4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: ```console mkdir vit cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin @@ -77,18 +76,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json ``` -5) Create the visual gguf model: +4) Create the visual gguf model: ```console python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -6) Then convert the model to gguf format: +5) Then convert the model to gguf format: ```console -python ./convert.py ../llava-v1.6-vicuna-7b/ +python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -7) And finally we can run the llava-cli using the 1.6 model version: +6) And finally we can run the llava-cli using the 1.6 model version: ```console ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 5bc5bc5137fe0..eb56d6988ac26 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -65,9 +65,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): for name in clip_tensors: del checkpoint[name] - # Save the updated checkpoint checkpoint_path = checkpoint_path - save_model(checkpoint, checkpoint_path, file_type) return True return False @@ -152,16 +150,6 @@ def proj_criteria(checkpoint): if len(projector) > 0: save_model(projector, f"{args.model}/llava.projector", 'pytorch') -for name in mm_tensors: - del last_checkpoint[name] -for name in first_mm_tensors: - del first_checkpoint[name] - -if len(mm_tensors) > 0: - save_model(last_checkpoint, projector_checkpoint_path, file_type) -if len(first_mm_tensors) > 0: - save_model(first_checkpoint, newline_checkpoint_path, file_type) - print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") From c14f72db9c62d71d35eb1c141745c0bd0cb27b49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 Feb 2024 15:39:54 +0200 Subject: [PATCH 09/51] readme : update hot topics --- README.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 225db8e49ce39..ce5dec7caeac9 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics -- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240 -- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138 - - [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series) -- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow -- Collecting Apple Silicon performance stats: - - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 - - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 +- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 +- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 ---- From eccd7a26ddbff19e4b8805648f5f14c501957859 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 Feb 2024 16:17:10 +0200 Subject: [PATCH 10/51] sync : ggml (#5633) * ggml : fix conv_2d batch mode (ggml/737) Co-authored-by: bssrdf * ggml : compute forward no longer pass src tensors (ggml/729) * sync : ggml ggml-ci --------- Co-authored-by: bssrdf Co-authored-by: bssrdf --- ggml.c | 1150 +++++++++++++++++++++++++--------------- scripts/sync-ggml.last | 2 +- 2 files changed, 711 insertions(+), 441 deletions(-) diff --git a/ggml.c b/ggml.c index 91adbb0aee6bc..5b9fa741a6479 100644 --- a/ggml.c +++ b/ggml.c @@ -5644,7 +5644,9 @@ struct ggml_tensor * ggml_conv_2d( ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] - result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW] + result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW] + return result; } @@ -6650,8 +6652,10 @@ void ggml_set_param( static void ggml_compute_forward_dup_same_cont( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); @@ -6682,8 +6686,10 @@ static void ggml_compute_forward_dup_same_cont( } static void ggml_compute_forward_dup_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -6696,7 +6702,7 @@ static void ggml_compute_forward_dup_f16( const int nth = params->nth; // number of threads if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -6953,8 +6959,10 @@ static void ggml_compute_forward_dup_f16( static void ggml_compute_forward_dup_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -6967,7 +6975,7 @@ static void ggml_compute_forward_dup_f32( const int nth = params->nth; // number of threads if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -7203,8 +7211,10 @@ static void ggml_compute_forward_dup_f32( // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. static void ggml_compute_forward_dup_bytes( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); @@ -7213,7 +7223,7 @@ static void ggml_compute_forward_dup_bytes( } if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -7352,21 +7362,23 @@ static void ggml_compute_forward_dup_bytes( static void ggml_compute_forward_dup( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (src0->type == dst->type) { - ggml_compute_forward_dup_bytes(params, src0, dst); + ggml_compute_forward_dup_bytes(params, dst); return; } switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_dup_f16(params, src0, dst); + ggml_compute_forward_dup_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_dup_f32(params, src0, dst); + ggml_compute_forward_dup_f32(params, dst); } break; default: { @@ -7379,9 +7391,11 @@ static void ggml_compute_forward_dup( static void ggml_compute_forward_add_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7467,9 +7481,11 @@ static void ggml_compute_forward_add_f32( static void ggml_compute_forward_add_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7544,9 +7560,11 @@ static void ggml_compute_forward_add_f16_f32( static void ggml_compute_forward_add_f16_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7598,9 +7616,11 @@ static void ggml_compute_forward_add_f16_f16( static void ggml_compute_forward_add_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7676,14 +7696,16 @@ static void ggml_compute_forward_add_q_f32( static void ggml_compute_forward_add( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case GGML_TYPE_F32: { if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(params, src0, src1, dst); + ggml_compute_forward_add_f32(params, dst); } else { GGML_ASSERT(false); @@ -7692,10 +7714,10 @@ static void ggml_compute_forward_add( case GGML_TYPE_F16: { if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + ggml_compute_forward_add_f16_f16(params, dst); } else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + ggml_compute_forward_add_f16_f32(params, dst); } else { GGML_ASSERT(false); @@ -7717,7 +7739,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_add_q_f32(params, src0, src1, dst); + ggml_compute_forward_add_q_f32(params, dst); } break; default: { @@ -7730,9 +7752,11 @@ static void ggml_compute_forward_add( static void ggml_compute_forward_add1_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7782,9 +7806,11 @@ static void ggml_compute_forward_add1_f32( static void ggml_compute_forward_add1_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7832,9 +7858,11 @@ static void ggml_compute_forward_add1_f16_f32( static void ggml_compute_forward_add1_f16_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7882,9 +7910,11 @@ static void ggml_compute_forward_add1_f16_f16( static void ggml_compute_forward_add1_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7949,21 +7979,23 @@ static void ggml_compute_forward_add1_q_f32( static void ggml_compute_forward_add1( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add1_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f32(params, dst); } break; case GGML_TYPE_F16: { if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + ggml_compute_forward_add1_f16_f16(params, dst); } else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f16_f32(params, dst); } else { GGML_ASSERT(false); @@ -7986,7 +8018,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + ggml_compute_forward_add1_q_f32(params, dst); } break; default: { @@ -7999,9 +8031,11 @@ static void ggml_compute_forward_add1( static void ggml_compute_forward_acc_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); @@ -8081,14 +8115,14 @@ static void ggml_compute_forward_acc_f32( static void ggml_compute_forward_acc( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_acc_f32(params, src0, src1, dst); + ggml_compute_forward_acc_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -8118,9 +8152,11 @@ static void ggml_compute_forward_acc( static void ggml_compute_forward_sub_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); @@ -8178,13 +8214,14 @@ static void ggml_compute_forward_sub_f32( static void ggml_compute_forward_sub( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sub_f32(params, src0, src1, dst); + ggml_compute_forward_sub_f32(params, dst); } break; default: { @@ -8197,9 +8234,11 @@ static void ggml_compute_forward_sub( static void ggml_compute_forward_mul_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8280,15 +8319,17 @@ static void ggml_compute_forward_mul_f32( static void ggml_compute_forward_mul( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mul_f32(params, src0, src1, dst); + ggml_compute_forward_mul_f32(params, dst); } break; default: { @@ -8301,9 +8342,11 @@ static void ggml_compute_forward_mul( static void ggml_compute_forward_div_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8374,13 +8417,14 @@ static void ggml_compute_forward_div_f32( static void ggml_compute_forward_div( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_div_f32(params, src0, src1, dst); + ggml_compute_forward_div_f32(params, dst); } break; default: { @@ -8393,8 +8437,10 @@ static void ggml_compute_forward_div( static void ggml_compute_forward_sqr_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -8417,12 +8463,14 @@ static void ggml_compute_forward_sqr_f32( static void ggml_compute_forward_sqr( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqr_f32(params, src0, dst); + ggml_compute_forward_sqr_f32(params, dst); } break; default: { @@ -8435,8 +8483,10 @@ static void ggml_compute_forward_sqr( static void ggml_compute_forward_sqrt_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -8459,12 +8509,14 @@ static void ggml_compute_forward_sqrt_f32( static void ggml_compute_forward_sqrt( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqrt_f32(params, src0, dst); + ggml_compute_forward_sqrt_f32(params, dst); } break; default: { @@ -8477,8 +8529,10 @@ static void ggml_compute_forward_sqrt( static void ggml_compute_forward_log_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -8501,12 +8555,14 @@ static void ggml_compute_forward_log_f32( static void ggml_compute_forward_log( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_log_f32(params, src0, dst); + ggml_compute_forward_log_f32(params, dst); } break; default: { @@ -8519,8 +8575,10 @@ static void ggml_compute_forward_log( static void ggml_compute_forward_sum_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_is_scalar(dst)); @@ -8552,8 +8610,10 @@ static void ggml_compute_forward_sum_f32( static void ggml_compute_forward_sum_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_is_scalar(dst)); @@ -8584,16 +8644,18 @@ static void ggml_compute_forward_sum_f16( static void ggml_compute_forward_sum( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sum_f32(params, src0, dst); + ggml_compute_forward_sum_f32(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_sum_f16(params, src0, dst); + ggml_compute_forward_sum_f16(params, dst); } break; default: { @@ -8606,8 +8668,10 @@ static void ggml_compute_forward_sum( static void ggml_compute_forward_sum_rows_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8639,12 +8703,14 @@ static void ggml_compute_forward_sum_rows_f32( static void ggml_compute_forward_sum_rows( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sum_rows_f32(params, src0, dst); + ggml_compute_forward_sum_rows_f32(params, dst); } break; default: { @@ -8657,8 +8723,10 @@ static void ggml_compute_forward_sum_rows( static void ggml_compute_forward_mean_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8694,12 +8762,14 @@ static void ggml_compute_forward_mean_f32( static void ggml_compute_forward_mean( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mean_f32(params, src0, dst); + ggml_compute_forward_mean_f32(params, dst); } break; default: { @@ -8712,8 +8782,10 @@ static void ggml_compute_forward_mean( static void ggml_compute_forward_argmax_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8740,12 +8812,14 @@ static void ggml_compute_forward_argmax_f32( static void ggml_compute_forward_argmax( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_argmax_f32(params, src0, dst); + ggml_compute_forward_argmax_f32(params, dst); } break; default: { @@ -8758,8 +8832,10 @@ static void ggml_compute_forward_argmax( static void ggml_compute_forward_repeat_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); @@ -8801,8 +8877,10 @@ static void ggml_compute_forward_repeat_f32( static void ggml_compute_forward_repeat_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); @@ -8847,18 +8925,20 @@ static void ggml_compute_forward_repeat_f16( static void ggml_compute_forward_repeat( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: case GGML_TYPE_I16: { - ggml_compute_forward_repeat_f16(params, src0, dst); + ggml_compute_forward_repeat_f16(params, dst); } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_repeat_f32(params, src0, dst); + ggml_compute_forward_repeat_f32(params, dst); } break; default: { @@ -8871,8 +8951,10 @@ static void ggml_compute_forward_repeat( static void ggml_compute_forward_repeat_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(dst, src0)); @@ -8928,12 +9010,14 @@ static void ggml_compute_forward_repeat_back_f32( static void ggml_compute_forward_repeat_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_repeat_back_f32(params, src0, dst); + ggml_compute_forward_repeat_back_f32(params, dst); } break; default: { @@ -8946,10 +9030,11 @@ static void ggml_compute_forward_repeat_back( static void ggml_compute_forward_concat_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -8994,14 +9079,15 @@ static void ggml_compute_forward_concat_f32( static void ggml_compute_forward_concat( const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* src1, struct ggml_tensor* dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_concat_f32(params, src0, src1, dst); + ggml_compute_forward_concat_f32(params, dst); } break; default: { @@ -9014,8 +9100,10 @@ static void ggml_compute_forward_concat( static void ggml_compute_forward_abs_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9038,12 +9126,14 @@ static void ggml_compute_forward_abs_f32( static void ggml_compute_forward_abs( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_abs_f32(params, src0, dst); + ggml_compute_forward_abs_f32(params, dst); } break; default: { @@ -9056,8 +9146,10 @@ static void ggml_compute_forward_abs( static void ggml_compute_forward_sgn_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9080,12 +9172,14 @@ static void ggml_compute_forward_sgn_f32( static void ggml_compute_forward_sgn( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sgn_f32(params, src0, dst); + ggml_compute_forward_sgn_f32(params, dst); } break; default: { @@ -9098,8 +9192,10 @@ static void ggml_compute_forward_sgn( static void ggml_compute_forward_neg_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9122,12 +9218,14 @@ static void ggml_compute_forward_neg_f32( static void ggml_compute_forward_neg( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_neg_f32(params, src0, dst); + ggml_compute_forward_neg_f32(params, dst); } break; default: { @@ -9140,8 +9238,10 @@ static void ggml_compute_forward_neg( static void ggml_compute_forward_step_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9164,12 +9264,14 @@ static void ggml_compute_forward_step_f32( static void ggml_compute_forward_step( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_step_f32(params, src0, dst); + ggml_compute_forward_step_f32(params, dst); } break; default: { @@ -9182,8 +9284,10 @@ static void ggml_compute_forward_step( static void ggml_compute_forward_tanh_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9206,12 +9310,14 @@ static void ggml_compute_forward_tanh_f32( static void ggml_compute_forward_tanh( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_tanh_f32(params, src0, dst); + ggml_compute_forward_tanh_f32(params, dst); } break; default: { @@ -9224,8 +9330,10 @@ static void ggml_compute_forward_tanh( static void ggml_compute_forward_elu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9248,12 +9356,14 @@ static void ggml_compute_forward_elu_f32( static void ggml_compute_forward_elu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_elu_f32(params, src0, dst); + ggml_compute_forward_elu_f32(params, dst); } break; default: { @@ -9266,8 +9376,10 @@ static void ggml_compute_forward_elu( static void ggml_compute_forward_relu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9290,12 +9402,14 @@ static void ggml_compute_forward_relu_f32( static void ggml_compute_forward_relu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_relu_f32(params, src0, dst); + ggml_compute_forward_relu_f32(params, dst); } break; default: { @@ -9308,8 +9422,10 @@ static void ggml_compute_forward_relu( static void ggml_compute_forward_gelu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9349,12 +9465,14 @@ static void ggml_compute_forward_gelu_f32( static void ggml_compute_forward_gelu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_gelu_f32(params, src0, dst); + ggml_compute_forward_gelu_f32(params, dst); } break; default: { @@ -9367,8 +9485,10 @@ static void ggml_compute_forward_gelu( static void ggml_compute_forward_gelu_quick_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9408,12 +9528,14 @@ static void ggml_compute_forward_gelu_quick_f32( static void ggml_compute_forward_gelu_quick( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_gelu_quick_f32(params, src0, dst); + ggml_compute_forward_gelu_quick_f32(params, dst); } break; default: { @@ -9426,8 +9548,10 @@ static void ggml_compute_forward_gelu_quick( static void ggml_compute_forward_silu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9467,12 +9591,14 @@ static void ggml_compute_forward_silu_f32( static void ggml_compute_forward_silu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_silu_f32(params, src0, dst); + ggml_compute_forward_silu_f32(params, dst); } break; default: { @@ -9484,8 +9610,10 @@ static void ggml_compute_forward_silu( static void ggml_compute_forward_leaky_relu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9511,12 +9639,14 @@ static void ggml_compute_forward_leaky_relu_f32( static void ggml_compute_forward_leaky_relu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_leaky_relu_f32(params, src0, dst); + ggml_compute_forward_leaky_relu_f32(params, dst); } break; default: { @@ -9529,9 +9659,11 @@ static void ggml_compute_forward_leaky_relu( static void ggml_compute_forward_silu_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * grad = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); @@ -9574,13 +9706,14 @@ static void ggml_compute_forward_silu_back_f32( static void ggml_compute_forward_silu_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + ggml_compute_forward_silu_back_f32(params, dst); } break; default: { @@ -9592,8 +9725,10 @@ static void ggml_compute_forward_silu_back( static void ggml_compute_forward_hardswish_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9615,12 +9750,14 @@ static void ggml_compute_forward_hardswish_f32( } static void ggml_compute_forward_hardswish( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_hardswish_f32(params, src0, dst); + ggml_compute_forward_hardswish_f32(params, dst); } break; default: { @@ -9631,8 +9768,10 @@ static void ggml_compute_forward_hardswish( static void ggml_compute_forward_hardsigmoid_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9655,12 +9794,14 @@ static void ggml_compute_forward_hardsigmoid_f32( static void ggml_compute_forward_hardsigmoid( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_hardsigmoid_f32(params, src0, dst); + ggml_compute_forward_hardsigmoid_f32(params, dst); } break; default: { @@ -9674,8 +9815,10 @@ static void ggml_compute_forward_hardsigmoid( static void ggml_compute_forward_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9727,12 +9870,14 @@ static void ggml_compute_forward_norm_f32( static void ggml_compute_forward_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_norm_f32(params, src0, dst); + ggml_compute_forward_norm_f32(params, dst); } break; default: { @@ -9745,8 +9890,10 @@ static void ggml_compute_forward_norm( static void ggml_compute_forward_rms_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9795,12 +9942,14 @@ static void ggml_compute_forward_rms_norm_f32( static void ggml_compute_forward_rms_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_f32(params, src0, dst); + ggml_compute_forward_rms_norm_f32(params, dst); } break; default: { @@ -9811,9 +9960,11 @@ static void ggml_compute_forward_rms_norm( static void ggml_compute_forward_rms_norm_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9968,13 +10119,14 @@ static void ggml_compute_forward_rms_norm_back_f32( static void ggml_compute_forward_rms_norm_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + ggml_compute_forward_rms_norm_back_f32(params, dst); } break; default: { @@ -9987,8 +10139,10 @@ static void ggml_compute_forward_rms_norm_back( static void ggml_compute_forward_group_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -10059,12 +10213,14 @@ static void ggml_compute_forward_group_norm_f32( static void ggml_compute_forward_group_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_group_norm_f32(params, src0, dst); + ggml_compute_forward_group_norm_f32(params, dst); } break; default: { @@ -10110,9 +10266,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -10357,10 +10515,11 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, - const struct ggml_tensor * ids, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * ids = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS @@ -10551,9 +10710,11 @@ static void ggml_compute_forward_mul_mat_id( static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); // UNUSED(t0); @@ -10743,9 +10904,11 @@ static void ggml_compute_forward_out_prod_f32( static void ggml_compute_forward_out_prod_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); // UNUSED(t0); @@ -10856,9 +11019,10 @@ static void ggml_compute_forward_out_prod_q_f32( static void ggml_compute_forward_out_prod( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -10876,16 +11040,16 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_q_f32(params, dst); } break; case GGML_TYPE_F16: { GGML_ASSERT(false); // todo - // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + // ggml_compute_forward_out_prod_f16_f32(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_f32(params, dst); } break; default: { @@ -10898,8 +11062,10 @@ static void ggml_compute_forward_out_prod( static void ggml_compute_forward_scale_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -10940,12 +11106,14 @@ static void ggml_compute_forward_scale_f32( static void ggml_compute_forward_scale( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_scale_f32(params, src0, dst); + ggml_compute_forward_scale_f32(params, dst); } break; default: { @@ -10958,9 +11126,11 @@ static void ggml_compute_forward_scale( static void ggml_compute_forward_set_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); @@ -11031,14 +11201,14 @@ static void ggml_compute_forward_set_f32( static void ggml_compute_forward_set( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_set_f32(params, src0, src1, dst); + ggml_compute_forward_set_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11068,29 +11238,25 @@ static void ggml_compute_forward_set( static void ggml_compute_forward_cpy( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); + ggml_compute_forward_dup(params, dst); } // ggml_compute_forward_cont static void ggml_compute_forward_cont( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); + ggml_compute_forward_dup(params, dst); } // ggml_compute_forward_reshape static void ggml_compute_forward_reshape( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); UNUSED(dst); } @@ -11098,39 +11264,41 @@ static void ggml_compute_forward_reshape( static void ggml_compute_forward_view( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_permute static void ggml_compute_forward_permute( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_transpose static void ggml_compute_forward_transpose( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_get_rows static void ggml_compute_forward_get_rows_q( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11166,9 +11334,11 @@ static void ggml_compute_forward_get_rows_q( static void ggml_compute_forward_get_rows_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11201,9 +11371,11 @@ static void ggml_compute_forward_get_rows_f16( static void ggml_compute_forward_get_rows_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11236,9 +11408,10 @@ static void ggml_compute_forward_get_rows_f32( static void ggml_compute_forward_get_rows( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -11257,16 +11430,16 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_get_rows_q(params, src0, src1, dst); + ggml_compute_forward_get_rows_q(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_f16(params, dst); } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_f32(params, dst); } break; default: { @@ -11297,9 +11470,11 @@ static void ggml_compute_forward_get_rows( static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11334,9 +11509,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16( static void ggml_compute_forward_get_rows_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11371,17 +11548,18 @@ static void ggml_compute_forward_get_rows_back_f32( static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_back_f32_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_back_f32(params, dst); } break; default: { @@ -11412,8 +11590,10 @@ static void ggml_compute_forward_get_rows_back( static void ggml_compute_forward_diag_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11452,12 +11632,14 @@ static void ggml_compute_forward_diag_f32( static void ggml_compute_forward_diag( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_f32(params, src0, dst); + ggml_compute_forward_diag_f32(params, dst); } break; default: { @@ -11470,10 +11652,11 @@ static void ggml_compute_forward_diag( static void ggml_compute_forward_diag_mask_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const float value) { + const struct ggml_tensor * src0 = dst->src[0]; + const int ith = params->ith; const int nth = params->nth; @@ -11523,12 +11706,14 @@ static void ggml_compute_forward_diag_mask_f32( static void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); } break; default: { @@ -11539,12 +11724,14 @@ static void ggml_compute_forward_diag_mask_inf( static void ggml_compute_forward_diag_mask_zero( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); + ggml_compute_forward_diag_mask_f32(params, dst, 0); } break; default: { @@ -11557,10 +11744,12 @@ static void ggml_compute_forward_diag_mask_zero( static void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); @@ -11671,14 +11860,14 @@ static void ggml_compute_forward_soft_max_f32( static void ggml_compute_forward_soft_max( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst); + ggml_compute_forward_soft_max_f32(params, dst); } break; default: { @@ -11691,9 +11880,11 @@ static void ggml_compute_forward_soft_max( static void ggml_compute_forward_soft_max_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11768,13 +11959,14 @@ static void ggml_compute_forward_soft_max_back_f32( static void ggml_compute_forward_soft_max_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + ggml_compute_forward_soft_max_back_f32(params, dst); } break; default: { @@ -11787,8 +11979,10 @@ static void ggml_compute_forward_soft_max_back( static void ggml_compute_forward_alibi_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11844,8 +12038,10 @@ static void ggml_compute_forward_alibi_f32( static void ggml_compute_forward_alibi_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11904,16 +12100,18 @@ static void ggml_compute_forward_alibi_f16( static void ggml_compute_forward_alibi( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_alibi_f16(params, src0, dst); + ggml_compute_forward_alibi_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_alibi_f32(params, src0, dst); + ggml_compute_forward_alibi_f32(params, dst); } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -11946,8 +12144,10 @@ static void ggml_compute_forward_alibi( static void ggml_compute_forward_clamp_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11986,12 +12186,14 @@ static void ggml_compute_forward_clamp_f32( static void ggml_compute_forward_clamp( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_clamp_f32(params, src0, dst); + ggml_compute_forward_clamp_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -12081,10 +12283,12 @@ GGML_CALL void ggml_rope_yarn_corr_dims( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12257,10 +12461,12 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12422,17 +12628,18 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst, true); + ggml_compute_forward_rope_f16(params, dst, true); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst, true); + ggml_compute_forward_rope_f32(params, dst, true); } break; default: { @@ -12445,17 +12652,18 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst, false); + ggml_compute_forward_rope_f16(params, dst, false); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst, false); + ggml_compute_forward_rope_f32(params, dst, false); } break; default: { @@ -12468,9 +12676,11 @@ static void ggml_compute_forward_rope_back( static void ggml_compute_forward_conv_transpose_1d_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12565,9 +12775,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( static void ggml_compute_forward_conv_transpose_1d_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12662,17 +12874,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32( static void ggml_compute_forward_conv_transpose_1d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f32(params, dst); } break; default: { @@ -12686,9 +12899,11 @@ static void ggml_compute_forward_conv_transpose_1d( // dst: result [N, OH, OW, IC*KH*KW] static void ggml_compute_forward_im2col_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12772,9 +12987,11 @@ static void ggml_compute_forward_im2col_f32( // dst: result [N, OH, OW, IC*KH*KW] static void ggml_compute_forward_im2col_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16); @@ -12854,17 +13071,15 @@ static void ggml_compute_forward_im2col_f16( static void ggml_compute_forward_im2col( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (dst->type) { case GGML_TYPE_F16: { - ggml_compute_forward_im2col_f16(params, src0, src1, dst); + ggml_compute_forward_im2col_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_im2col_f32(params, src0, src1, dst); + ggml_compute_forward_im2col_f32(params, dst); } break; default: { @@ -12878,9 +13093,11 @@ static void ggml_compute_forward_im2col( static void ggml_compute_forward_conv_transpose_2d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12984,9 +13201,11 @@ static void ggml_compute_forward_conv_transpose_2d( static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_compute_params * params, const enum ggml_op_pool op, - const struct ggml_tensor * src, const int k, struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); @@ -13035,7 +13254,6 @@ static void ggml_compute_forward_pool_1d_sk_p0( static void ggml_compute_forward_pool_1d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; @@ -13046,15 +13264,17 @@ static void ggml_compute_forward_pool_1d( GGML_ASSERT(p0 == 0); // padding not supported GGML_ASSERT(k0 == s0); // only s = k supported - ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); + ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); } // ggml_compute_forward_pool_2d static void ggml_compute_forward_pool_2d( const struct ggml_compute_params * params, - const struct ggml_tensor * src, struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(params->ith == 0); @@ -13127,9 +13347,10 @@ static void ggml_compute_forward_pool_2d( static void ggml_compute_forward_upscale_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13166,12 +13387,14 @@ static void ggml_compute_forward_upscale_f32( static void ggml_compute_forward_upscale( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_upscale_f32(params, src0, dst); + ggml_compute_forward_upscale_f32(params, dst); } break; default: { @@ -13184,9 +13407,10 @@ static void ggml_compute_forward_upscale( static void ggml_compute_forward_pad_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13224,12 +13448,14 @@ static void ggml_compute_forward_pad_f32( static void ggml_compute_forward_pad( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_pad_f32(params, src0, dst); + ggml_compute_forward_pad_f32(params, dst); } break; default: { @@ -13242,9 +13468,10 @@ static void ggml_compute_forward_pad( static void ggml_compute_forward_argsort_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13284,13 +13511,14 @@ static void ggml_compute_forward_argsort_f32( static void ggml_compute_forward_argsort( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_argsort_f32(params, src0, dst); + ggml_compute_forward_argsort_f32(params, dst); } break; default: { @@ -13303,11 +13531,13 @@ static void ggml_compute_forward_argsort( static void ggml_compute_forward_flash_attn_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13493,11 +13723,13 @@ static void ggml_compute_forward_flash_attn_f32( static void ggml_compute_forward_flash_attn_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13719,19 +13951,19 @@ static void ggml_compute_forward_flash_attn_f16( static void ggml_compute_forward_flash_attn( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + switch (q->type) { case GGML_TYPE_F16: { - ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f16(params, masked, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f32(params, masked, dst); } break; default: { @@ -13744,12 +13976,14 @@ static void ggml_compute_forward_flash_attn( static void ggml_compute_forward_flash_ff_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * a, // F16 - const struct ggml_tensor * b0, // F16 fc_w - const struct ggml_tensor * b1, // F32 fc_b - const struct ggml_tensor * c0, // F16 proj_w - const struct ggml_tensor * c1, // F32 proj_b struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; // F16 + const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w + const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b + const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w + const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13877,16 +14111,14 @@ static void ggml_compute_forward_flash_ff_f16( static void ggml_compute_forward_flash_ff( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b0, - const struct ggml_tensor * b1, - const struct ggml_tensor * c0, - const struct ggml_tensor * c1, struct ggml_tensor * dst) { + + const struct ggml_tensor * b0 = dst->src[1]; + switch (b0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + ggml_compute_forward_flash_ff_f16(params, dst); } break; case GGML_TYPE_F32: { @@ -13903,12 +14135,14 @@ static void ggml_compute_forward_flash_ff( static void ggml_compute_forward_flash_attn_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * d, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + const struct ggml_tensor * d = dst->src[3]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -14256,16 +14490,15 @@ static void ggml_compute_forward_flash_attn_back_f32( static void ggml_compute_forward_flash_attn_back( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * d, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + switch (q->type) { case GGML_TYPE_F32: { - ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + ggml_compute_forward_flash_attn_back_f32(params, masked, dst); } break; default: { @@ -14278,8 +14511,10 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_win_part_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14322,12 +14557,14 @@ static void ggml_compute_forward_win_part_f32( static void ggml_compute_forward_win_part( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_part_f32(params, src0, dst); + ggml_compute_forward_win_part_f32(params, dst); } break; default: { @@ -14340,8 +14577,10 @@ static void ggml_compute_forward_win_part( static void ggml_compute_forward_win_unpart_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14382,12 +14621,14 @@ static void ggml_compute_forward_win_unpart_f32( static void ggml_compute_forward_win_unpart( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_unpart_f32(params, src0, dst); + ggml_compute_forward_win_unpart_f32(params, dst); } break; default: { @@ -14400,58 +14641,58 @@ static void ggml_compute_forward_win_unpart( static void ggml_compute_forward_unary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const enum ggml_unary_op op = ggml_get_unary_op(dst); switch (op) { case GGML_UNARY_OP_ABS: { - ggml_compute_forward_abs(params, src0, dst); + ggml_compute_forward_abs(params, dst); } break; case GGML_UNARY_OP_SGN: { - ggml_compute_forward_sgn(params, src0, dst); + ggml_compute_forward_sgn(params, dst); } break; case GGML_UNARY_OP_NEG: { - ggml_compute_forward_neg(params, src0, dst); + ggml_compute_forward_neg(params, dst); } break; case GGML_UNARY_OP_STEP: { - ggml_compute_forward_step(params, src0, dst); + ggml_compute_forward_step(params, dst); } break; case GGML_UNARY_OP_TANH: { - ggml_compute_forward_tanh(params, src0, dst); + ggml_compute_forward_tanh(params, dst); } break; case GGML_UNARY_OP_ELU: { - ggml_compute_forward_elu(params, src0, dst); + ggml_compute_forward_elu(params, dst); } break; case GGML_UNARY_OP_RELU: { - ggml_compute_forward_relu(params, src0, dst); + ggml_compute_forward_relu(params, dst); } break; case GGML_UNARY_OP_GELU: { - ggml_compute_forward_gelu(params, src0, dst); + ggml_compute_forward_gelu(params, dst); } break; case GGML_UNARY_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, src0, dst); + ggml_compute_forward_gelu_quick(params, dst); } break; case GGML_UNARY_OP_SILU: { - ggml_compute_forward_silu(params, src0, dst); + ggml_compute_forward_silu(params, dst); } break; case GGML_UNARY_OP_HARDSWISH: { - ggml_compute_forward_hardswish(params, src0, dst); + ggml_compute_forward_hardswish(params, dst); } break; case GGML_UNARY_OP_HARDSIGMOID: { - ggml_compute_forward_hardsigmoid(params, src0, dst); + ggml_compute_forward_hardsigmoid(params, dst); } break; default: { @@ -14464,8 +14705,10 @@ static void ggml_compute_forward_unary( static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14491,12 +14734,14 @@ static void ggml_compute_forward_get_rel_pos_f16( static void ggml_compute_forward_get_rel_pos( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + ggml_compute_forward_get_rel_pos_f16(params, dst); } break; default: { @@ -14509,11 +14754,12 @@ static void ggml_compute_forward_get_rel_pos( static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; if (!inplace && params->type == GGML_TASK_INIT) { if (params->ith != 0) { @@ -14577,14 +14823,14 @@ static void ggml_compute_forward_add_rel_pos_f32( static void ggml_compute_forward_add_rel_pos( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + ggml_compute_forward_add_rel_pos_f32(params, dst); } break; default: { @@ -14597,9 +14843,11 @@ static void ggml_compute_forward_add_rel_pos( static void ggml_compute_forward_map_unary_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14621,13 +14869,15 @@ static void ggml_compute_forward_map_unary_f32( static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + ggml_compute_forward_map_unary_f32(params, dst, fun); } break; default: { @@ -14640,10 +14890,12 @@ static void ggml_compute_forward_map_unary( static void ggml_compute_forward_map_binary_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); @@ -14668,14 +14920,15 @@ static void ggml_compute_forward_map_binary_f32( static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + ggml_compute_forward_map_binary_f32(params, dst, fun); } break; default: { @@ -14688,9 +14941,11 @@ static void ggml_compute_forward_map_binary( static void ggml_compute_forward_map_custom1_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, struct ggml_tensor * dst, const ggml_custom1_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14704,10 +14959,12 @@ static void ggml_compute_forward_map_custom1_f32( static void ggml_compute_forward_map_custom2_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, struct ggml_tensor * dst, const ggml_custom2_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14721,11 +14978,13 @@ static void ggml_compute_forward_map_custom2_f32( static void ggml_compute_forward_map_custom3_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - const struct ggml_tensor * c, struct ggml_tensor * dst, const ggml_custom3_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14739,8 +14998,10 @@ static void ggml_compute_forward_map_custom3_f32( static void ggml_compute_forward_map_custom1( const struct ggml_compute_params * params, - const struct ggml_tensor * a, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14754,9 +15015,11 @@ static void ggml_compute_forward_map_custom1( static void ggml_compute_forward_map_custom2( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14770,10 +15033,12 @@ static void ggml_compute_forward_map_custom2( static void ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - const struct ggml_tensor * c, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[2]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14787,9 +15052,11 @@ static void ggml_compute_forward_map_custom3( static void ggml_compute_forward_cross_entropy_loss_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_scalar(dst)); @@ -14893,13 +15160,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( static void ggml_compute_forward_cross_entropy_loss( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + ggml_compute_forward_cross_entropy_loss_f32(params, dst); } break; default: { @@ -14912,10 +15180,12 @@ static void ggml_compute_forward_cross_entropy_loss( static void ggml_compute_forward_cross_entropy_loss_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * opt0 = dst->src[2]; + GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); @@ -15002,14 +15272,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( static void ggml_compute_forward_cross_entropy_loss_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); } break; default: { @@ -15057,312 +15327,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src[0], tensor); + ggml_compute_forward_dup(params, tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_add(params, tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_add1(params, tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_acc(params, tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_sub(params, tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul(params, tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_div(params, tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src[0], tensor); + ggml_compute_forward_sqr(params, tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src[0], tensor); + ggml_compute_forward_sqrt(params, tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src[0], tensor); + ggml_compute_forward_log(params, tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src[0], tensor); + ggml_compute_forward_sum(params, tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); + ggml_compute_forward_sum_rows(params, tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src[0], tensor); + ggml_compute_forward_mean(params, tensor); } break; case GGML_OP_ARGMAX: { - ggml_compute_forward_argmax(params, tensor->src[0], tensor); + ggml_compute_forward_argmax(params, tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src[0], tensor); + ggml_compute_forward_repeat(params, tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); + ggml_compute_forward_repeat_back(params, tensor); } break; case GGML_OP_CONCAT: { - ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_concat(params, tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_silu_back(params, tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src[0], tensor); + ggml_compute_forward_norm(params, tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); + ggml_compute_forward_rms_norm(params, tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rms_norm_back(params, tensor); } break; case GGML_OP_GROUP_NORM: { - ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + ggml_compute_forward_group_norm(params, tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat(params, tensor); } break; case GGML_OP_MUL_MAT_ID: { - ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat_id(params, tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_out_prod(params, tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src[0], tensor); + ggml_compute_forward_scale(params, tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_set(params, tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src[0], tensor); + ggml_compute_forward_cpy(params, tensor); } break; case GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src[0], tensor); + ggml_compute_forward_cont(params, tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src[0], tensor); + ggml_compute_forward_reshape(params, tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src[0]); + ggml_compute_forward_view(params, tensor); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src[0]); + ggml_compute_forward_permute(params, tensor); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src[0]); + ggml_compute_forward_transpose(params, tensor); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_get_rows(params, tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_get_rows_back(params, tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src[0], tensor); + ggml_compute_forward_diag(params, tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + ggml_compute_forward_diag_mask_inf(params, tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + ggml_compute_forward_diag_mask_zero(params, tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_soft_max(params, tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_soft_max_back(params, tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope(params, tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope_back(params, tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src[0], tensor); + ggml_compute_forward_alibi(params, tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src[0], tensor); + ggml_compute_forward_clamp(params, tensor); } break; case GGML_OP_CONV_TRANSPOSE_1D: { - ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_conv_transpose_1d(params, tensor); } break; case GGML_OP_IM2COL: { - ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_im2col(params, tensor); } break; case GGML_OP_CONV_TRANSPOSE_2D: { - ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_conv_transpose_2d(params, tensor); } break; case GGML_OP_POOL_1D: { - ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); + ggml_compute_forward_pool_1d(params, tensor); } break; case GGML_OP_POOL_2D: { - ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); + ggml_compute_forward_pool_2d(params, tensor); } break; case GGML_OP_UPSCALE: { - ggml_compute_forward_upscale(params, tensor->src[0], tensor); + ggml_compute_forward_upscale(params, tensor); } break; case GGML_OP_PAD: { - ggml_compute_forward_pad(params, tensor->src[0], tensor); + ggml_compute_forward_pad(params, tensor); } break; case GGML_OP_ARGSORT: { - ggml_compute_forward_argsort(params, tensor->src[0], tensor); + ggml_compute_forward_argsort(params, tensor); } break; case GGML_OP_LEAKY_RELU: { - ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor); + ggml_compute_forward_leaky_relu(params, tensor); } break; case GGML_OP_FLASH_ATTN: { const int32_t t = ggml_get_op_params_i32(tensor, 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + ggml_compute_forward_flash_attn(params, masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + ggml_compute_forward_flash_ff(params, tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src[0], tensor); + ggml_compute_forward_win_part(params, tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); + ggml_compute_forward_win_unpart(params, tensor); } break; case GGML_OP_UNARY: { - ggml_compute_forward_unary(params, tensor->src[0], tensor); + ggml_compute_forward_unary(params, tensor); } break; case GGML_OP_GET_REL_POS: { - ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + ggml_compute_forward_get_rel_pos(params, tensor); } break; case GGML_OP_ADD_REL_POS: { - ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_add_rel_pos(params, tensor); } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_unary(params, tensor, fun); } break; case GGML_OP_MAP_BINARY: { ggml_binary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_binary(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM2_F32: { ggml_custom2_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM3_F32: { ggml_custom3_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + ggml_compute_forward_map_custom1(params, tensor); } break; case GGML_OP_MAP_CUSTOM2: { - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_map_custom2(params, tensor); } break; case GGML_OP_MAP_CUSTOM3: { - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_map_custom3(params, tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor); } break; case GGML_OP_NONE: diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 733d8f95b93df..97f34ac851722 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -818eeb8a3be99125746a90ec63af8f51516a2ec6 +4712fd12d7acb9971f850b1b98588f934cb39444 From a00a35cef93e057eace8351a667d14d152a91ebc Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 21 Feb 2024 15:39:10 +0100 Subject: [PATCH 11/51] readme : add LocalAI to the availables UI (#5629) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ce5dec7caeac9..c1624b9f9a348 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [nat/openplayground](https://github.com/nat/openplayground) - [Faraday](https://faraday.dev/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary) +- [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) From 1ecea255ebb70750b52688393f37a63606b90e3f Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Wed, 21 Feb 2024 15:47:48 +0100 Subject: [PATCH 12/51] server: health: fix race condition on slots data using tasks queue (#5634) * server: health: fix race condition on slots data using tasks queue * server: health: * include_slots only if slots_endpoint * fix compile warning task.target_id not initialized. --- examples/server/README.md | 2 + examples/server/server.cpp | 122 ++++++++++++++++++++++++------------- examples/server/utils.hpp | 3 +- 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index f6b9c74021f30..6d9f96cd4ba64 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -140,6 +140,8 @@ node index.js - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. + If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. + - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eb01729fa7a8a..1c44795128fc1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1394,6 +1394,46 @@ struct llama_server_context case TASK_TYPE_NEXT_RESPONSE: { // do nothing } break; + case TASK_TYPE_SLOTS_DATA: { + json slots_data = json::array(); + int n_idle_slots = 0; + int n_processing_slots = 0; + + for (llama_client_slot &slot: slots) { + if (slot.available()) { + n_idle_slots++; + } else { + n_processing_slots++; + } + json slot_data = get_formated_generation(slot); + slot_data["id"] = slot.id; + slot_data["task_id"] = slot.task_id; + slot_data["state"] = slot.state; + slot_data["prompt"] = slot.prompt; + slot_data["next_token"] = { + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }; + slots_data.push_back(slot_data); + } + LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); + task_result res; + res.id = task.id; + res.multitask_id = task.multitask_id; + res.stop = true; + res.error = false; + res.result_json = { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "slots", slots_data } + }; + queue_results.send(res); + } break; } } @@ -2557,34 +2597,38 @@ int main(int argc, char **argv) server_state current_state = state.load(); switch(current_state) { case SERVER_STATE_READY: { - int available_slots = 0; - int processing_slots = 0; - for (llama_client_slot &slot: llama.slots) { - if (slot.available()) { - available_slots++; - } else { - processing_slots++; - } + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_SLOTS_DATA; + task.target_id = -1; + + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + int n_idle_slots = result.result_json["idle"]; + int n_processing_slots = result.result_json["processing"]; + + json health = { + {"status", "ok"}, + {"slots_idle", n_idle_slots}, + {"slots_processing", n_processing_slots}}; + res.status = 200; // HTTP OK + if (sparams.slots_endpoint && req.has_param("include_slots")) { + health["slots"] = result.result_json["slots"]; } - if (available_slots > 0) { - json health = { - {"status", "ok"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); - res.status = 200; // HTTP OK - } else { - json health = { - {"status", "no slot available"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); + + if (n_idle_slots == 0) { + health["status"] = "no slot available"; if (req.has_param("fail_on_no_slot")) { res.status = 503; // HTTP Service Unavailable - } else { - res.status = 200; // HTTP OK } } + res.set_content(health.dump(), "application/json"); break; } case SERVER_STATE_LOADING_MODEL: @@ -2600,26 +2644,20 @@ int main(int argc, char **argv) if (sparams.slots_endpoint) { svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) { - json slots; - for (llama_client_slot & slot : llama.slots) { - json slot_data = llama.get_formated_generation(slot); - slot_data["id"] = slot.id; - slot_data["task_id"] = slot.task_id; - slot_data["state"] = slot.state; - slot_data["prompt"] = slot.prompt; - slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }; + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_SLOTS_DATA; + task.target_id = -1; - slots.push_back(slot_data); - } - res.set_content(slots.dump(), "application/json"); + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + res.set_content(result.result_json["slots"].dump(), "application/json"); res.status = 200; // HTTP OK }); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e954fb0ef0413..88545eb6931d0 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -49,7 +49,8 @@ enum server_state { enum task_type { TASK_TYPE_COMPLETION, TASK_TYPE_CANCEL, - TASK_TYPE_NEXT_RESPONSE + TASK_TYPE_NEXT_RESPONSE, + TASK_TYPE_SLOTS_DATA }; struct task_server { From 5022cf242d689e15defd133f96c4345ad30c5d19 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 Feb 2024 16:52:39 +0200 Subject: [PATCH 13/51] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 97f34ac851722..bbbf88d9d6ff7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -4712fd12d7acb9971f850b1b98588f934cb39444 +30805514e1bf389a59d30a54a0525cbdc30d5bd1 From 89febfed9322c8849520dc63c93ee4f5fd72556e Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 10:33:54 -0500 Subject: [PATCH 14/51] examples : do not assume BOS when shifting context (#5622) --- examples/main/main.cpp | 12 +++++++----- examples/server/server.cpp | 13 +++++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f5d2f48935eb6..7555dffe441f0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -334,6 +334,8 @@ int main(int argc, char ** argv) { // number of tokens to keep when resetting context if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { params.n_keep = (int)embd_inp.size(); + } else { + params.n_keep += add_bos; // always keep the BOS token } // prefix & suffix for instruct mode @@ -383,8 +385,8 @@ int main(int argc, char ** argv) { } } - if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + if (params.n_keep > add_bos) { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } @@ -540,14 +542,14 @@ int main(int argc, char ** argv) { break; } - const int n_left = n_past - params.n_keep - 1; + const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1c44795128fc1..c84719a0d15d0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1487,14 +1487,15 @@ struct llama_server_context if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) { // Shift context - const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1; + const int n_keep = slot.params.n_keep + add_bos_token; + const int n_left = system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard); + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); - for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; } @@ -1507,7 +1508,7 @@ struct llama_server_context LOG_VERBOSE("context shift", { { "n_ctx", n_ctx }, - { "n_keep", params.n_keep }, + { "n_keep", n_keep }, { "n_left", n_left }, }); } From ba2135ccae7462470b3865c6e41d2e1d734eac05 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 21 Feb 2024 22:18:23 +0100 Subject: [PATCH 15/51] gemma : allow offloading the output tensor (#5646) --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3a226c4260c0b..4054d5da63fc4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4394,6 +4394,8 @@ static bool llm_load_tensors( // output model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading + ml.n_created--; // artificial tensor const int64_t n_ff = hparams.n_ff; const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -7525,7 +7527,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); From 7fe4678b0244ba7b03eae66ebeaa947e2770bb1a Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 21 Feb 2024 22:52:39 +0100 Subject: [PATCH 16/51] llama : fix session save/load with quantized KV (#5649) --- llama.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4054d5da63fc4..d763cc80cb4c2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12176,18 +12176,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { - const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - tmp_buf.resize(elt_size*kv_head); + size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); } } @@ -12289,17 +12290,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { if (kv_buf_size) { GGML_ASSERT(kv_self.total_size() == kv_buf_size); - const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = elt_size*n_embd_k_gqa*kv_head; + size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = elt_size*kv_head; + size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; } } From 7c8bcc11dc61cf5930b70cd0168b84afcebe12a9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 22 Feb 2024 00:31:00 +0100 Subject: [PATCH 17/51] Add docs for llama_chat_apply_template (#5645) * add docs for llama_chat_apply_template * fix typo --- examples/server/README.md | 1 + llama.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 6d9f96cd4ba64..4b24ee5dc3f28 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` - `-n, --n-predict`: Set the maximum tokens to predict (default: -1) - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. +- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) ## Build diff --git a/llama.h b/llama.h index 8ba20696f8af9..84f196b3bb625 100644 --- a/llama.h +++ b/llama.h @@ -708,7 +708,7 @@ extern "C" { /// Apply chat template. Inspired by hf apply_chat_template() on python. /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" - /// NOTE: This function only support some known jinja templates. It is not a jinja parser. + /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. /// @param chat Pointer to a list of multiple llama_chat_message /// @param n_msg Number of llama_chat_message in this chat From 973053d8b0d04809836b3339a50f68d9c842de90 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 22 Feb 2024 00:42:09 +0100 Subject: [PATCH 18/51] llama : fix loading models with shared tok_embd and output (#5651) ggml-ci --- llama.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index d763cc80cb4c2..259f2a3a3ea00 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2791,13 +2791,7 @@ struct llama_model_loader { std::vector> read_buf; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (!cur) { - // some tensors may be allocated in a different context - continue; - } - + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; @@ -3722,7 +3716,7 @@ static bool llm_load_tensors( } // create one context per buffer type - size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output std::map ctx_map; for (auto & it : buft_layer_count) { struct ggml_init_params params = { @@ -3860,6 +3854,7 @@ static bool llm_load_tensors( } else { model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); } } @@ -4396,6 +4391,7 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); const int64_t n_ff = hparams.n_ff; const int64_t n_embd_head_k = hparams.n_embd_head_k; From 4ef245a92a968ba0f18a5adfd41e51980ce4fdf5 Mon Sep 17 00:00:00 2001 From: Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> Date: Thu, 22 Feb 2024 18:15:13 +1000 Subject: [PATCH 19/51] mpt : add optional bias tensors (#5638) Update for MPT with optional bias parameters: to work with PhoGPT and SEA-LION models that were pre-trained with 'bias'. --- llama.cpp | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index 259f2a3a3ea00..9cae8c761f3ac 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4054,6 +4054,8 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } @@ -4063,14 +4065,23 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); // AWQ ScaleActivation layer layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); @@ -6171,7 +6182,7 @@ struct llm_build_context { attn_norm = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(attn_norm, "attn_norm", il); @@ -6181,6 +6192,11 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + + if (model.layers[il].bqkv){ + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } if (hparams.f_clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -6198,7 +6214,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, NULL, + model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6211,13 +6227,13 @@ struct llm_build_context { { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -6234,7 +6250,7 @@ struct llm_build_context { cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, - NULL, + model.output_norm_b, LLM_NORM, cb, -1); cb(cur, "result_norm", -1); From c5688c6250430d2b8e0259efcf26c16dfa4c1f46 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Thu, 22 Feb 2024 08:27:32 +0000 Subject: [PATCH 20/51] server : clarify some params in the docs (#5640) --- examples/server/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 4b24ee5dc3f28..4b6cd8326efa8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -151,7 +151,7 @@ node index.js `temperature`: Adjust the randomness of the generated text (default: 0.8). - `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled). + `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled). `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0). @@ -209,7 +209,7 @@ node index.js `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1) - `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false) + `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false) `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) @@ -242,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`. - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) -- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model` +- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` - `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token From a46f50747b2028f7f9c9883b26bfba12bf92556e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 22 Feb 2024 09:33:24 +0100 Subject: [PATCH 21/51] server : fallback to chatml, add AlphaMonarch chat template (#5628) * server: fallback to chatml * add new chat template * server: add AlphaMonarch to test chat template * server: only check model template if there is no custom tmpl * remove TODO --- examples/server/server.cpp | 15 +++++++++++++++ llama.cpp | 9 +++++++++ tests/test-chat-template.cpp | 23 +++++++++++++++-------- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c84719a0d15d0..369121e885b27 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -400,6 +400,16 @@ struct llama_server_context return true; } + void validate_model_chat_template(server_params & sparams) { + llama_chat_message chat[] = {{"user", "test"}}; + std::vector buf(1); + int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); + if (res < 0) { + LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); + sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template + } + } + void initialize() { // create slots all_slots_are_idle = true; @@ -2752,6 +2762,11 @@ int main(int argc, char **argv) LOG_INFO("model loaded", {}); } + if (sparams.chat_template.empty()) { // custom chat template is not supplied + // check if the template comes with the model is supported by us + llama.validate_model_chat_template(sparams); + } + // Middleware for API key validation auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { // If API key is not set, skip validation diff --git a/llama.cpp b/llama.cpp index 9cae8c761f3ac..055b57e3187f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12773,6 +12773,15 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } + } else if (tmpl.find("bos_token + message['role']") != std::string::npos) { + // mlabonne/AlphaMonarch-7B template (the is included inside history) + for (auto message : chat) { + std::string bos = (message == chat.front()) ? "" : ""; // skip BOS for first message + ss << bos << message->role << "\n" << message->content << "\n"; + } + if (add_ass) { + ss << "assistant\n"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 9830650d4f8dd..d02b39e144947 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -27,12 +27,20 @@ int main(void) { "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <>\\\\n' + messages[idx]['content'] + '\\\\n<>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}", // bofenghuang/vigogne-2-70b-chat "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\\\n' + content.strip() + '\\\\n<>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + // mlabonne/AlphaMonarch-7B + "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", }; - std::vector expected_substr = { - "<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant", - "[/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - "[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - "[/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + std::vector expected_output = { + // teknium/OpenHermes-2.5-Mistral-7B + "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n", + // mistralai/Mistral-7B-Instruct-v0.2 + "[INST] You are a helpful assistant\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + // TheBloke/FusionNet_34Bx2_MoE-AWQ + "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + // bofenghuang/vigogne-2-70b-chat + "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + // mlabonne/AlphaMonarch-7B + "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", }; std::vector formatted_chat(1024); int32_t res; @@ -43,7 +51,7 @@ int main(void) { for (size_t i = 0; i < templates.size(); i++) { std::string custom_template = templates[i]; - std::string substr = expected_substr[i]; + std::string expected = expected_output[i]; formatted_chat.resize(1024); res = llama_chat_apply_template( nullptr, @@ -57,8 +65,7 @@ int main(void) { formatted_chat.resize(res); std::string output(formatted_chat.data(), formatted_chat.size()); std::cout << output << "\n-------------------------\n"; - // expect the "formatted_chat" to contain pre-defined strings - assert(output.find(substr) != std::string::npos); + assert(output == expected); } return 0; } From 56d03d92be57f5880b9ed94542d87bb6effae31f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 10:35:54 +0200 Subject: [PATCH 22/51] readme : update hot topics --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c1624b9f9a348..3bc512af0602b 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics +- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 From 3a03541cedea474fa9d41214484cc3fbcf468a9e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 13:54:03 +0200 Subject: [PATCH 23/51] minor : fix trailing whitespace (#5638) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 055b57e3187f2..6ab5e1bf4f409 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6192,7 +6192,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - + if (model.layers[il].bqkv){ cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); From 4cb4d8b22d4fda971621a68c570ce84d66897c37 Mon Sep 17 00:00:00 2001 From: Someone Date: Thu, 22 Feb 2024 16:32:09 +0000 Subject: [PATCH 24/51] workflows: nix: hardcode cachix ids, build unconditionally (#5663) GitHub does not expose environment and repository variables to PRs coming from forks implies that we've been disabling the Nix CI actions for most PRs. The `if:` also didn't make much sense, because we can always pull from cachix, and there's no point (albeit no risk either) in pushing cache for the untrusted code. --- .github/workflows/nix-ci-aarch64.yml | 7 +++---- .github/workflows/nix-ci.yml | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 0c6cf5f091528..8d0a3fd7fd313 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -19,7 +19,6 @@ on: jobs: nix-build-aarch64: - if: ${{ vars.CACHIX_NAME != '' }} runs-on: ubuntu-latest steps: - name: Checkout repository @@ -37,8 +36,8 @@ jobs: extra-conf: | extra-platforms = aarch64-linux extra-system-features = nixos-test kvm - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -46,7 +45,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} + name: llama-cpp - name: Show all output paths run: > nix run github:nix-community/nix-eval-jobs diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index d19c7a576cdf6..01c5a9d5aaca2 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -23,8 +23,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -37,7 +37,6 @@ jobs: --flake ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" nix-build: - if: ${{ vars.CACHIX_NAME != '' }} strategy: fail-fast: false matrix: @@ -51,8 +50,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -60,7 +59,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} + name: llama-cpp - name: Build run: > nix run github:Mic92/nix-fast-build From 373ee3fbbabc4c1508eed4f5c3795b23a20939a3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 22 Feb 2024 19:10:21 +0100 Subject: [PATCH 25/51] Add Gemma chat template (#5665) * add gemma chat template * gemma: only apply system_prompt on non-model message --- llama.cpp | 22 ++++++++++++++++++++++ tests/test-chat-template.cpp | 4 ++++ 2 files changed, 26 insertions(+) diff --git a/llama.cpp b/llama.cpp index 6ab5e1bf4f409..40dda265ccc93 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12782,6 +12782,28 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } + } else if (tmpl.find("") != std::string::npos) { + // google/gemma-7b-it + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken + system_prompt = trim(message->content); + continue; + } + // in gemma, "assistant" is "model" + role = role == "assistant" ? "model" : message->role; + ss << "" << role << "\n"; + if (!system_prompt.empty() && role != "model") { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << trim(message->content) << "\n"; + } + if (add_ass) { + ss << "model\n"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index d02b39e144947..fa2eb577b6e42 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -29,6 +29,8 @@ int main(void) { "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\\\n' + content.strip() + '\\\\n<>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", // mlabonne/AlphaMonarch-7B "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", + // google/gemma-7b-it + "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\\n' + message['content'] | trim + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\\n'}}{% endif %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -41,6 +43,8 @@ int main(void) { "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", // mlabonne/AlphaMonarch-7B "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", + // google/gemma-7b-it + "user\nYou are a helpful assistant\n\nHello\nmodel\nHi there\nuser\nWho are you\nmodel\nI am an assistant\nuser\nAnother question\nmodel\n", }; std::vector formatted_chat(1024); int32_t res; From 5a9e2f60ba3d8362ba17c77ac3092906d49b813f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 20:13:25 +0200 Subject: [PATCH 26/51] py : minor fixes (#5668) --- convert-hf-to-gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 9771fccf9ffc1..8630bbf2980c1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -655,6 +655,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) def write_tensors(self): @@ -1031,7 +1033,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) def set_vocab(self): self._set_vocab_sentencepiece() From 201294ae177b308fb3a99dc504dd6d27e8afa907 Mon Sep 17 00:00:00 2001 From: Someone Date: Thu, 22 Feb 2024 19:44:10 +0000 Subject: [PATCH 27/51] nix: init singularity and docker images (#5056) Exposes a few attributes demonstrating how to build [singularity](https://docs.sylabs.io/guides/latest/user-guide/)/[apptainer](https://apptainer.org/) and Docker images re-using llama.cpp's Nix expression. Built locally on `x86_64-linux` with `nix build github:someoneserge/llama.cpp/feat/nix/images#llamaPackages.{docker,docker-min,sif,llama-cpp}` and it's fast and effective. --- .devops/nix/docker.nix | 37 +++++++++++++++++++++++++++++++++++++ .devops/nix/scope.nix | 3 +++ .devops/nix/sif.nix | 27 +++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 .devops/nix/docker.nix create mode 100644 .devops/nix/sif.nix diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix new file mode 100644 index 0000000000000..d607b4575772c --- /dev/null +++ b/.devops/nix/docker.nix @@ -0,0 +1,37 @@ +{ + lib, + dockerTools, + buildEnv, + llama-cpp, + interactive ? true, + coreutils, +}: + +# A tar that can be fed into `docker load`: +# +# $ nix build .#llamaPackages.docker +# $ docker load < result + +# For details and variations cf. +# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage +# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922 +# - https://nixery.dev/ + +# Approximate (compressed) sizes, at the time of writing, are: +# +# .#llamaPackages.docker: 125M; +# .#llamaPackagesCuda.docker: 537M; +# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. + +dockerTools.buildLayeredImage { + name = llama-cpp.pname; + tag = "latest"; + + contents = + [ llama-cpp ] + ++ lib.optionals interactive [ + coreutils + dockerTools.binSh + dockerTools.caCertificates + ]; +} diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index d295995a4b96b..78530c9e8a230 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -12,5 +12,8 @@ lib.makeScope newScope ( self: { inherit llamaVersion; llama-cpp = self.callPackage ./package.nix { }; + docker = self.callPackage ./docker.nix { }; + docker-min = self.callPackage ./docker.nix { interactive = false; }; + sif = self.callPackage ./sif.nix { }; } ) diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix new file mode 100644 index 0000000000000..7535ca0f3088e --- /dev/null +++ b/.devops/nix/sif.nix @@ -0,0 +1,27 @@ +{ + lib, + singularity-tools, + llama-cpp, + bashInteractive, + interactive ? false, +}: + +let + optionalInt = cond: x: if cond then x else 0; +in +singularity-tools.buildImage rec { + inherit (llama-cpp) name; + contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; + + # These are excessive (but safe) for most variants. Building singularity + # images requires superuser privileges, so we build them inside a VM in a + # writable image of pre-determined size. + # + # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846 + # + # Expected image sizes: + # - cpu/blas: 150M, + # - cuda, all gencodes: 560M, + diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; + memSize = diskSize; +} From efd56b1c2139d50b9b4381a212feb75d69598fda Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 18:31:40 +0200 Subject: [PATCH 28/51] ggml : 32-bit arm compat (whisper/1891) * ggml : 32-bit arm compat * ggml : add ggml_vqtbl1q_s8 impl * ggml : cont --- ggml-quants.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 6336538f0e99e..8917c8af14255 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -438,6 +438,30 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { return res; } +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + #else #define ggml_int16x8x2_t int16x8x2_t @@ -451,6 +475,7 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { #define ggml_vld1q_u8_x4 vld1q_u8_x4 #define ggml_vld1q_s8_x2 vld1q_s8_x2 #define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 #endif @@ -9333,7 +9358,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const uint16_t gindex[8]; uint16x8x2_t vindex; int8x16x4_t q1b; - int8x16x4_t q8b; + ggml_int8x16x4_t q8b; uint16x8x4_t scales; int32x4x2_t sumi; int32x4x2_t dotq; @@ -9506,10 +9531,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * q8b.val[2] = vld1q_s8(y[ib+1].qs); q8b.val[3] = vld1q_s8(y[ib+1].qs + 16); - q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b)); - q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); - q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b)); - q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); + q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); + q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); From 334f76fa385ed81095165e5ae068756214893901 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:21:05 +0200 Subject: [PATCH 29/51] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index bbbf88d9d6ff7..59de343706f2a 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -30805514e1bf389a59d30a54a0525cbdc30d5bd1 +8cdf783f288a98eddf521b0ab1b4d405be9e18ba From 7e4f339c404dbe029d4a117c03b37a9bf646cf0e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:21:39 +0200 Subject: [PATCH 30/51] ggml : always define ggml_fp16_t as uint16_t (#5666) * ggml : always define ggml_fp16_t as uint16_t ggml-ci * ggml : cont ggml-ci * ggml : cont * ggml : cont ggml-ci * ggml : cont ggml-ci * cuda : no longer ggml headers last ggml-ci * ggml : fix q6_K FP16 -> FP32 conversion ggml-ci * ggml : more FP16 -> FP32 conversion fixes ggml-ci --- ggml-cuda.cu | 9 ++++----- ggml-impl.h | 27 ++++++++++++++++++++------- ggml-quants.c | 30 +++++++++++++++--------------- ggml.c | 6 +++--- ggml.h | 6 ------ 5 files changed, 42 insertions(+), 36 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e7c211d7d6087..b0e454e025ec4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,3 +1,7 @@ +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" + #include #include #include @@ -121,11 +125,6 @@ #endif // defined(GGML_USE_HIPBLAS) -// ggml-cuda need half type so keep ggml headers include at last -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" - #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CC_PASCAL 600 diff --git a/ggml-impl.h b/ggml-impl.h index 19df66bceee4a..c5637e4d45d8c 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -53,11 +53,23 @@ extern "C" { // #include -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; +} -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; +} #else @@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) - +#if !defined(GGML_FP16_TO_FP32) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); @@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { } #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif #define GGML_HASHTABLE_FULL ((size_t)-1) diff --git a/ggml-quants.c b/ggml-quants.c index 8917c8af14255..b15977f53e2f3 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; - const float dmin = -y[i].d * (float)x[i].dmin; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; - const float dmin = -y[i].d * (float)x[i].dmin; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); @@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r aux16[1] = (a[0] >> 4) & 0x0f0f; const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); - sum_mins += y[i].d * (float)x[i].d[1] * summi; + sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi; - const float d = y[i].d * (float)x[i].d[0]; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); @@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * sc = x[i].scales; const uint8_t * restrict q5 = x[i].qs; @@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * sc = x[i].scales; const uint8_t * restrict q5 = x[i].qs; @@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d_all = (float)x[i].d; + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d_all = (float)x[i].d; + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ib = 0; ib < nb; ib += 2) { - q4bits.val[0] = vld1q_u8(x[ib+0].qs); q4bits.val[1] = vld1q_u8(x[ib+1].qs); q8b.val[0] = vld1q_s8(y[ib+0].qs); @@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2); - + sumf += + GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) + + GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2); } *s = sumf; diff --git a/ggml.c b/ggml.c index 5b9fa741a6479..d710fe702ddbd 100644 --- a/ggml.c +++ b/ggml.c @@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16]; // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { - return (float) GGML_FP16_TO_FP32(x); + return GGML_FP16_TO_FP32(x); } ggml_fp16_t ggml_fp32_to_fp16(float x) { @@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16x8 float16x8_t #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD vld1q_f16 + #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) #define GGML_F16x8_STORE vst1q_f16 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 @@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32Cx4 float32x4_t #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32Cx4_ADD vaddq_f32 diff --git a/ggml.h b/ggml.h index bed7a36a0ee6a..37eff627928e8 100644 --- a/ggml.h +++ b/ggml.h @@ -315,13 +315,7 @@ extern "C" { #endif -#if defined(__ARM_NEON) && defined(__CUDACC__) - typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - typedef __fp16 ggml_fp16_t; -#else typedef uint16_t ggml_fp16_t; -#endif // convert FP16 <-> FP32 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); From 847eedbdb2d1ebf14ef56eb507d4b4b975510908 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:22:48 +0200 Subject: [PATCH 31/51] py : add Gemma conversion from HF models (#5647) * py : add gemma conversion from HF models * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela * Update convert-hf-to-gguf.py Co-authored-by: Jared Van Bortel --------- Co-authored-by: Aarni Koskela Co-authored-by: Jared Van Bortel --- convert-hf-to-gguf.py | 60 +++++++++++++++++++++++++++++++++++++++++++ llama.cpp | 3 +++ 2 files changed, 63 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 8630bbf2980c1..481198dad042c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -218,6 +218,8 @@ def from_model_architecture(model_architecture): return BertModel if model_architecture == "NomicBertModel": return NomicBertModel + if model_architecture == "GemmaForCausalLM": + return GemmaModel return Model def _is_model_safetensors(self) -> bool: @@ -277,6 +279,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.BERT if arch == "NomicBertModel": return gguf.MODEL_ARCH.NOMIC_BERT + if arch == "GemmaForCausalLM": + return gguf.MODEL_ARCH.GEMMA raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -1786,6 +1790,62 @@ def get_tensors(self): yield name, data +class GemmaModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + ###### CONVERSION LOGIC ###### diff --git a/llama.cpp b/llama.cpp index 40dda265ccc93..7770fa0e8f6fa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7450,6 +7450,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -7491,6 +7492,7 @@ struct llm_build_context { n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); cb(Qcur, "Qcur_scaled", il); @@ -7505,6 +7507,7 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); cb(cur, "kqv_out", il); } + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); From 96633eeca1265ed03e57230de54032041c58f9cd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:23:46 +0200 Subject: [PATCH 32/51] gemma : use more bits for the token_embd.weight tensor (#5650) * gemma : use Q8_0 for the token_embd.weight tensor * llama : quantize token_embd.weight using output type --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 7770fa0e8f6fa..2ebd40df234f0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty return std::make_pair(i_layer, n_layer); }; - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { + // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings + // with the quantization of the output tensor + if (name == tn(LLM_TENSOR_OUTPUT, "weight") || + (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) { int nx = tensor->ne[0]; if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; From 15499eb94227401bdc8875da6eb85c15d37068f7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 22 Feb 2024 17:05:23 -0500 Subject: [PATCH 33/51] mpt : do not duplicate token_embd.weight on disk (#5670) --- convert-hf-to-gguf.py | 5 ----- llama.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 481198dad042c..9bdfce07ab7db 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -622,11 +622,6 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) - class OrionModel(Model): def set_vocab(self): diff --git a/llama.cpp b/llama.cpp index 2ebd40df234f0..37477e6ef3c44 100644 --- a/llama.cpp +++ b/llama.cpp @@ -509,7 +509,6 @@ static std::map> LLM_TENSOR_NAMES = { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4056,7 +4055,10 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + // same as tok_embd, duplicated to allow offloading + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); } for (int i = 0; i < n_layer; ++i) { From 54fbcd2ce6c48c9e22eca6fbf9e53fb68c3e72ea Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Fri, 23 Feb 2024 13:39:14 -0500 Subject: [PATCH 34/51] convert : fix missing ftype for gemma (#5690) --- convert-hf-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 9bdfce07ab7db..32d54b45f3325 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1803,6 +1803,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) From fd43d66f46ee3b5345fb8a74a252d86ccd34a409 Mon Sep 17 00:00:00 2001 From: AlpinDale <52078762+AlpinDale@users.noreply.github.com> Date: Fri, 23 Feb 2024 19:31:54 +0000 Subject: [PATCH 35/51] server : add KV cache quantization options (#5684) --- examples/server/server.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 369121e885b27..524d0ada33ab0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1948,6 +1948,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: f16)\n"); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: f16)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); @@ -2386,6 +2390,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, ); llama.process_system_prompt_data(json::parse(systm_content)); } + else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } + else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + } else if(arg == "--mmproj") { if (++i >= argc) From 525213d2f5da1eaf4b922b6b792cb52b2c613368 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 24 Feb 2024 12:28:55 +0100 Subject: [PATCH 36/51] server: init functional tests (#5566) * server: tests: init scenarios - health and slots endpoints - completion endpoint - OAI compatible chat completion requests w/ and without streaming - completion multi users scenario - multi users scenario on OAI compatible endpoint with streaming - multi users with total number of tokens to predict exceeds the KV Cache size - server wrong usage scenario, like in Infinite loop of "context shift" #3969 - slots shifting - continuous batching - embeddings endpoint - multi users embedding endpoint: Segmentation fault #5655 - OpenAI-compatible embeddings API - tokenize endpoint - CORS and api key scenario * server: CI GitHub workflow --------- Co-authored-by: Georgi Gerganov --- .github/ISSUE_TEMPLATE/bug.md | 2 + .github/workflows/server.yml | 127 ++++ examples/server/README.md | 6 + examples/server/server.cpp | 36 +- examples/server/tests/README.md | 46 ++ examples/server/tests/features/environment.py | 67 ++ examples/server/tests/features/issues.feature | 36 + .../server/tests/features/parallel.feature | 77 ++ .../server/tests/features/security.feature | 50 ++ examples/server/tests/features/server.feature | 69 ++ examples/server/tests/features/steps/steps.py | 709 ++++++++++++++++++ .../tests/features/wrong_usages.feature | 21 + examples/server/tests/requirements.txt | 3 + examples/server/tests/tests.sh | 12 + 14 files changed, 1243 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/server.yml create mode 100644 examples/server/tests/README.md create mode 100644 examples/server/tests/features/environment.py create mode 100644 examples/server/tests/features/issues.feature create mode 100644 examples/server/tests/features/parallel.feature create mode 100644 examples/server/tests/features/security.feature create mode 100644 examples/server/tests/features/server.feature create mode 100644 examples/server/tests/features/steps/steps.py create mode 100644 examples/server/tests/features/wrong_usages.feature create mode 100644 examples/server/tests/requirements.txt create mode 100755 examples/server/tests/tests.sh diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md index ce69e6395daae..49812832ca542 100644 --- a/.github/ISSUE_TEMPLATE/bug.md +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -7,3 +7,5 @@ assignees: '' --- Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. + +If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests). diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml new file mode 100644 index 0000000000000..ed27dc528fb61 --- /dev/null +++ b/.github/workflows/server.yml @@ -0,0 +1,127 @@ +# Server build and tests +name: Server + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + - test/server-add-ci-test # FIXME remove + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + +jobs: + server: + runs-on: ubuntu-latest + + strategy: + matrix: + build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan] + sanitizer: [ADDRESS, THREAD, UNDEFINED] + build_type: [Debug, Release] + include: + - build: 'noavx' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF' + image: ubuntu:latest + - build: 'avx2' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' + image: ubuntu:latest + - build: 'avx' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' + image: ubuntu:latest + - build: 'avx512' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON' + image: ubuntu:latest + experimental: true + - build: 'cublas' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON' + image: nvidia/cuda:12.3.1-devel-ubuntu22.04 + arch_not_available: true # require nvidia docker engine + - build: 'clblast' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON' + image: ubuntu:latest + arch_not_available: true + - build: 'openblas' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS' + image: ubuntu:latest + - build: 'kompute' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' + image: ubuntu:latest + arch_not_available: true + - build: 'vulkan' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON' + image: ubuntu:latest + arch_not_available: true + + container: + image: ${{ matrix.image }} + ports: + - 8888 + options: --cpus 4 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + apt-get update + apt-get -y install \ + build-essential \ + pkg-config \ + git \ + cmake \ + python3-pip \ + wget \ + psmisc + + - name: Download CLBlast + id: get_clblast + if: ${{ matrix.build == 'clblast' }} + run: | + apt install -y libclblast-dev + + - name: Download OpenBLAS + id: get_openblas + if: ${{ matrix.build == 'openblas' }} + run: | + apt-get -y install libopenblas-dev + + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} + run: | + wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + apt-get update + apt-get -y install vulkan-sdk + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }} + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server + + - name: Tests dependencies + id: test_dependencies + run: | + pip install -r examples/server/tests/requirements.txt + + - name: Download models + id: download_models + run: | + cd examples/server/tests + ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf + + - name: Tests + id: server_integration_test + continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }} + run: | + cd examples/server/tests + PORT=8888 ./tests.sh diff --git a/examples/server/README.md b/examples/server/README.md index 4b6cd8326efa8..0c43ac4c97cba 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -98,6 +98,12 @@ curl --request POST \ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' ``` +## Advanced testing + +We implemented a [server test framework](./tests/README.md) using human-readable scenario. + +*Before submitting an issue, please try to reproduce it with this format.* + ## Node JS Test You need to have [Node.js](https://nodejs.org/en) installed. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 524d0ada33ab0..9fb436c2a18ec 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1410,11 +1410,6 @@ struct llama_server_context int n_processing_slots = 0; for (llama_client_slot &slot: slots) { - if (slot.available()) { - n_idle_slots++; - } else { - n_processing_slots++; - } json slot_data = get_formated_generation(slot); slot_data["id"] = slot.id; slot_data["task_id"] = slot.task_id; @@ -1429,6 +1424,11 @@ struct llama_server_context {"stopped_limit", slot.stopped_limit}, {"stopping_word", slot.stopping_word}, }; + if (slot_data["state"] == IDLE) { + n_idle_slots++; + } else { + n_processing_slots++; + } slots_data.push_back(slot_data); } LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); @@ -2748,19 +2748,6 @@ int main(int argc, char **argv) log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } - LOG_INFO("HTTP server listening", log_data); - // run the HTTP server in a thread - see comment below - std::thread t([&]() - { - if (!svr.listen_after_bind()) - { - state.store(SERVER_STATE_ERROR); - return 1; - } - - return 0; - }); - // load the model if (!llama.load_model(params)) { @@ -3228,6 +3215,19 @@ int main(int argc, char **argv) }*/ //); + LOG_INFO("HTTP server listening", log_data); + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + if (!svr.listen_after_bind()) + { + state.store(SERVER_STATE_ERROR); + return 1; + } + + return 0; + }); + llama.queue_tasks.on_new_task(std::bind( &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md new file mode 100644 index 0000000000000..e44c5c286601f --- /dev/null +++ b/examples/server/tests/README.md @@ -0,0 +1,46 @@ +# Server tests + +Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/): + * [issues.feature](./features/issues.feature) Pending issues scenario + * [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests + * [security.feature](./features/security.feature) Security, CORS and API Key + * [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc... + +Tests target GitHub workflows job runners with 4 vCPU. + +Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client. + +Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`. + +### Install dependencies +`pip install -r requirements.txt` + +### Run tests +1. Build the server +```shell +cd ../../.. +mkdir build +cd build +cmake ../ +cmake --build . --target server +``` +2. download required models: + 1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf` +3. Start the test: `./tests.sh` + +It's possible to override some scenario steps values with environment variables: + - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` + - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` + - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` + +### Run @bug, @wip or @wrong_usage annotated scenario + +Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope. +- `@bug` annotation aims to link a scenario with a GitHub issue. +- `@wrong_usage` are meant to show user issue that are actually an expected behavior +- `@wip` to focus on a scenario working in progress + +To run a scenario annotated with `@bug`, start: +`DEBUG=ON ./tests.sh --no-skipped --tags bug` + +After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py new file mode 100644 index 0000000000000..13cc841017f62 --- /dev/null +++ b/examples/server/tests/features/environment.py @@ -0,0 +1,67 @@ +import os +import socket +import subprocess +import time +from contextlib import closing +from signal import SIGKILL + + +def before_scenario(context, scenario): + print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") + port = 8080 + if 'PORT' in os.environ: + port = int(os.environ['PORT']) + if is_server_listening("localhost", port): + assert False, "Server already started" + + +def after_scenario(context, scenario): + if scenario.status == "failed": + if 'GITHUB_ACTIONS' in os.environ: + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") + if os.path.isfile('llama.log'): + with closing(open('llama.log', 'r')) as f: + for line in f: + print(line) + if not is_server_listening(context.server_fqdn, context.server_port): + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") + + if not pid_exists(context.server_process.pid): + assert False, f"Server not running pid={context.server_process.pid} ..." + + print(f"stopping server pid={context.server_process.pid} ...") + context.server_process.kill() + # Wait few for socket to free up + time.sleep(0.05) + + attempts = 0 + while is_server_listening(context.server_fqdn, context.server_port): + print(f"stopping server pid={context.server_process.pid} ...") + os.kill(context.server_process.pid, SIGKILL) + time.sleep(0.1) + attempts += 1 + if attempts > 5: + print(f"Server dangling exits, killing all {context.server_path} ...") + process = subprocess.run(['killall', '-9', context.server_path], + stderr=subprocess.PIPE, + universal_newlines=True) + print(process) + + +def is_server_listening(server_fqdn, server_port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((server_fqdn, server_port)) + return result == 0 + + +def pid_exists(pid): + """Check whether pid exists in the current process table.""" + import errno + if pid < 0: + return False + try: + os.kill(pid, 0) + except OSError as e: + return e.errno == errno.EPERM + else: + return True diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature new file mode 100644 index 0000000000000..542006d9a8df2 --- /dev/null +++ b/examples/server/tests/features/issues.feature @@ -0,0 +1,36 @@ +# List of ongoing issues +@bug +Feature: Issues + # Issue #5655 + Scenario: Multi users embeddings + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + And 64 KV cache size + And 2 slots + And continuous batching + And embeddings extraction + Then the server is starting + Then the server is healthy + + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature new file mode 100644 index 0000000000000..802d624ffc9a3 --- /dev/null +++ b/examples/server/tests/features/parallel.feature @@ -0,0 +1,77 @@ +@llama.cpp +Feature: Parallel + + Background: Server startup + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + And 64 KV cache size + And 2 slots + And continuous batching + Then the server is starting + Then the server is healthy + + Scenario Outline: Multi users completion + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And max tokens to predict + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all prompts are predicted with tokens + Examples: + | n_predict | + | 128 | + + Scenario Outline: Multi users OAI completions compatibility + Given a system prompt You are a writer. + And a model tinyllama-2 + Given a prompt: + """ + Write a very long book. + """ + And a prompt: + """ + Write another a poem. + """ + And max tokens to predict + And streaming is + Given concurrent OAI completions requests + Then the server is busy + Then the server is idle + Then all prompts are predicted with tokens + Examples: + | streaming | n_predict | + | disabled | 128 | + | enabled | 64 | + + Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969 + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + And 128 max tokens to predict + Given concurrent completion requests + Then the server is busy + Then the server is idle + Then all prompts are predicted diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature new file mode 100644 index 0000000000000..db06d39775c05 --- /dev/null +++ b/examples/server/tests/features/security.feature @@ -0,0 +1,50 @@ +@llama.cpp +Feature: Security + + Background: Server startup with an api key defined + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a server api key llama.cpp + Then the server is starting + Then the server is healthy + + Scenario Outline: Completion with some user api key + Given a prompt test + And a user api key + And 4 max tokens to predict + And a completion request with api error + + Examples: Prompts + | api_key | api_error | + | llama.cpp | no | + | llama.cpp | no | + | hackeme | raised | + | | raised | + + Scenario Outline: OAI Compatibility + Given a system prompt test + And a user prompt test + And a model test + And 2 max tokens to predict + And streaming is disabled + And a user api key + Given an OAI compatible chat completions request with api error + + Examples: Prompts + | api_key | api_error | + | llama.cpp | no | + | llama.cpp | no | + | hackme | raised | + + + Scenario Outline: CORS Options + When an OPTIONS request is sent from + Then CORS header is set to + + Examples: Headers + | origin | cors_header | cors_header_value | + | localhost | Access-Control-Allow-Origin | localhost | + | web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr | + | origin | Access-Control-Allow-Credentials | true | + | web.mydomain.fr | Access-Control-Allow-Methods | POST | + | web.mydomain.fr | Access-Control-Allow-Headers | * | diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature new file mode 100644 index 0000000000000..fedcfe5aef1b3 --- /dev/null +++ b/examples/server/tests/features/server.feature @@ -0,0 +1,69 @@ +@llama.cpp +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + # KV Cache corresponds to the total amount of tokens + # that can be stored across all independent sequences: #4130 + # see --ctx-size and #5568 + And 32 KV cache size + And 1 slots + And embeddings extraction + And 32 server max tokens to predict + Then the server is starting + Then the server is healthy + + Scenario: Health + Then the server is ready + And all slots are idle + + Scenario Outline: Completion + Given a prompt + And max tokens to predict + And a completion request with no api error + Then tokens are predicted matching + + Examples: Prompts + | prompt | n_predict | re_content | n_predicted | + | I believe the meaning of life is | 8 | read | 8 | + | Write a joke about AI | 64 | (parkfriendsscared)+ | 32 | + + Scenario Outline: OAI Compatibility + Given a model + And a system prompt + And a user prompt + And max tokens to predict + And streaming is + Given an OAI compatible chat completions request with no api error + Then tokens are predicted matching + + Examples: Prompts + | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming | + | llama-2 | Book | What is the best book | 8 | (Momwhat)+ | 8 | disabled | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thankshappybird)+ | 32 | enabled | + + Scenario: Embedding + When embeddings are computed for: + """ + What is the capital of Bulgaria ? + """ + Then embeddings are generated + + Scenario: OAI Embeddings compatibility + Given a model tinyllama-2 + When an OAI compatible embeddings computation request for: + """ + What is the capital of Spain ? + """ + Then embeddings are generated + + + Scenario: Tokenize / Detokenize + When tokenizing: + """ + What is the capital of France ? + """ + Then tokens can be detokenize diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py new file mode 100644 index 0000000000000..50f2b641e764e --- /dev/null +++ b/examples/server/tests/features/steps/steps.py @@ -0,0 +1,709 @@ +import asyncio +import json +import os +import re +import socket +import subprocess +import time +from contextlib import closing +from re import RegexFlag + +import aiohttp +import openai +from behave import step +from behave.api.async_step import async_run_until_complete + + +@step(u"a server listening on {server_fqdn}:{server_port}") +def step_server_config(context, server_fqdn, server_port): + context.server_fqdn = server_fqdn + context.server_port = int(server_port) + if 'PORT' in os.environ: + context.server_port = int(os.environ['PORT']) + print(f"$PORT set, overriding server port with to {context.server_port}") + + context.base_url = f'http://{context.server_fqdn}:{context.server_port}' + + context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' + context.model_alias = None + context.n_ctx = None + context.n_predict = None + context.n_server_predict = None + context.n_slots = None + context.server_api_key = None + context.server_continuous_batching = False + context.server_embeddings = False + context.server_seed = None + context.user_api_key = None + + context.tasks_result = [] + context.concurrent_tasks = [] + context.prompts = [] + + +@step(u'a model file {model_file}') +def step_model_file(context, model_file): + context.model_file = model_file + + +@step(u'a model alias {model_alias}') +def step_model_alias(context, model_alias): + context.model_alias = model_alias + + +@step(u'{seed} as server seed') +def step_seed(context, seed): + context.server_seed = int(seed) + + +@step(u'{n_ctx} KV cache size') +def step_n_ctx(context, n_ctx): + context.n_ctx = int(n_ctx) + + +@step(u'{n_slots} slots') +def step_n_slots(context, n_slots): + context.n_slots = int(n_slots) + + +@step(u'{n_predict} server max tokens to predict') +def step_server_n_predict(context, n_predict): + context.n_server_predict = int(n_predict) + + +@step(u'continuous batching') +def step_server_continuous_batching(context): + context.server_continuous_batching = True + + +@step(u'embeddings extraction') +def step_server_embeddings(context): + context.server_embeddings = True + + +@step(u"the server is starting") +def step_start_server(context): + start_server_background(context) + attempts = 0 + while True: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((context.server_fqdn, context.server_port)) + if result == 0: + print("\x1b[33;46mserver started!\x1b[0m") + return + attempts += 1 + if attempts > 20: + assert False, "server not started" + print(f"waiting for server to start, connect error code = {result}...") + time.sleep(0.1) + + +@step(u"the server is {expecting_status}") +@async_run_until_complete +async def step_wait_for_the_server_to_be_started(context, expecting_status): + match expecting_status: + case 'healthy': + await wait_for_health_status(context, context.base_url, 200, 'ok') + + case 'ready' | 'idle': + await wait_for_health_status(context, context.base_url, 200, 'ok', + params={'fail_on_no_slot': 0, 'include_slots': 0}, + slots_idle=context.n_slots, + slots_processing=0, + expected_slots=[{'id': slot_id, 'state': 0} + for slot_id in range(context.n_slots)]) + case 'busy': + await wait_for_health_status(context, context.base_url, 503, + 'no slot available', + params={'fail_on_no_slot': 0, 'include_slots': 0}, + slots_idle=0, + slots_processing=context.n_slots, + expected_slots=[{'id': slot_id, 'state': 1} + for slot_id in range(context.n_slots)]) + case _: + assert False, "unknown status" + + +@step(u'all slots are {expected_slot_status_string}') +@async_run_until_complete +async def step_all_slots_status(context, expected_slot_status_string): + match expected_slot_status_string: + case 'idle': + expected_slot_status = 0 + case 'busy': + expected_slot_status = 1 + case _: + assert False, "unknown status" + + expected_slots = [{'id': slot_id, 'state': expected_slot_status} + for slot_id in range(context.n_slots)] + await request_slots_status(context, expected_slots) + + +@step(u'a completion request with {api_error} api error') +@async_run_until_complete +async def step_request_completion(context, api_error): + expect_api_error = api_error == 'raised' + completion = await request_completion(context.prompts.pop(), + context.base_url, + debug=context.debug, + n_predict=context.n_predict, + server_seed=context.server_seed, + expect_api_error=expect_api_error, + user_api_key=context.user_api_key) + context.tasks_result.append(completion) + if context.debug: + print(f"Completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + +@step(u'{predicted_n} tokens are predicted matching {re_content}') +def step_n_tokens_predicted_with_content(context, predicted_n, re_content): + assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content) + + +@step(u'{predicted_n} tokens are predicted') +def step_n_tokens_predicted(context, predicted_n): + assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n)) + + +@step(u'a user prompt {user_prompt}') +def step_user_prompt(context, user_prompt): + context.prompts.append(user_prompt) + + +@step(u'a system prompt {system_prompt}') +def step_system_prompt(context, system_prompt): + context.system_prompt = system_prompt + + +@step(u'a model {model}') +def step_model(context, model): + context.model = model + + +@step(u'{max_tokens} max tokens to predict') +def step_max_tokens(context, max_tokens): + context.n_predict = int(max_tokens) + + +@step(u'streaming is {enable_streaming}') +def step_streaming(context, enable_streaming): + context.enable_streaming = enable_streaming == 'enabled' + + +@step(u'a user api key {user_api_key}') +def step_user_api_key(context, user_api_key): + context.user_api_key = user_api_key + + +@step(u'no user api key') +def step_no_user_api_key(context): + context.user_api_key = None + + +@step(u'a user api key ') +def step_no_user_api_key_space(context): + context.user_api_key = None + + +@step(u'a server api key {server_api_key}') +def step_server_api_key(context, server_api_key): + context.server_api_key = server_api_key + + +@step(u'an OAI compatible chat completions request with {api_error} api error') +@async_run_until_complete +async def step_oai_chat_completions(context, api_error): + if context.debug: + print(f"Submitting OAI compatible completions request...") + expect_api_error = api_error == 'raised' + completion = await oai_chat_completions(context.prompts.pop(), + context.system_prompt, + context.base_url, + False, + model=context.model if hasattr(context, 'model') else None, + + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None, + + expect_api_error=expect_api_error) + context.tasks_result.append(completion) + if context.debug: + print(f"Completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + if context.debug: + print(f"Completion response: {completion}") + + +@step(u'a prompt') +def step_a_prompt(context): + context.prompts.append(context.text) + + +@step(u'a prompt {prompt}') +def step_a_prompt_prompt(context, prompt): + context.prompts.append(prompt) + + +@step(u'concurrent completion requests') +@async_run_until_complete() +async def step_concurrent_completion_requests(context): + await concurrent_completion_requests(context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + server_seed=context.server_seed if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key if hasattr(context, + 'user_api_key') else None) + + +@step(u'concurrent OAI completions requests') +@async_run_until_complete +async def step_oai_chat_completions(context): + await concurrent_completion_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) + + +@step(u'all prompts are predicted') +@async_run_until_complete +async def step_all_prompts_are_predicted(context): + await all_prompts_are_predicted(context) + + +@step(u'all prompts are predicted with {n_predict} tokens') +@async_run_until_complete +async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict): + expected_predicted_n = int(n_predict) + await all_prompts_are_predicted(context, expected_predicted_n) + + +async def all_prompts_are_predicted(context, expected_predicted_n=None): + n_completions = await gather_tasks_results(context) + assert n_completions > 0 + for i in range(n_completions): + assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n) + assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests" + + +@step(u'embeddings are computed for') +@async_run_until_complete +async def step_compute_embedding(context): + content = context.text + base_url = context.base_url + context.embeddings = await request_embedding(content, base_url) + + +@step(u'embeddings are generated') +def step_assert_embeddings(context): + assert_embeddings(context.embeddings) + + +@step(u'an OAI compatible embeddings computation request for') +def step_oai_compute_embedding(context): + openai.api_key = 'nope' # openai client always expects an api_keu + if context.user_api_key is not None: + openai.api_key = context.user_api_key + openai.api_base = f'{context.base_url}/v1' + embeddings = openai.Embedding.create( + model=context.model, + input=context.text, + ) + context.embeddings = embeddings + + +@step(u'concurrent embedding requests') +@async_run_until_complete() +async def step_concurrent_embedding_requests(context): + await concurrent_completion_requests(context, + request_embedding, + # prompt is inserted automatically + context.base_url) + + +@step(u'all embeddings are generated') +@async_run_until_complete() +async def all_embeddings_are_generated(context): + n_embedding_requests = await gather_tasks_results(context) + assert n_embedding_requests > 0 + for i in range(n_embedding_requests): + assert_embeddings(context.tasks_result.pop()) + + +@step(u'tokenizing') +@async_run_until_complete +async def step_tokenize(context): + context.tokenized_text = context.text + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/tokenize', + json={ + "content": context.tokenized_text, + }) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens = tokenize_json['tokens'] + + +@step(u'tokens can be detokenize') +@async_run_until_complete +async def step_detokenize(context): + assert len(context.tokens) > 0 + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/detokenize', + json={ + "tokens": context.tokens, + }) as response: + assert response.status == 200 + detokenize_json = await response.json() + # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15 + assert context.tokenized_text == detokenize_json['content'].strip() + + +@step(u'an OPTIONS request is sent from {origin}') +@async_run_until_complete +async def step_options_request(context, origin): + async with aiohttp.ClientSession() as session: + async with session.options(f'{context.base_url}/v1/chat/completions', + headers={"Origin": origin}) as response: + assert response.status == 200 + context.options_response = response + + +@step(u'CORS header {cors_header} is set to {cors_header_value}') +def step_check_options_header_value(context, cors_header, cors_header_value): + assert context.options_response.headers[cors_header] == cors_header_value + + +async def concurrent_completion_requests(context, f_completion, *args, **kwargs): + n_prompts = len(context.prompts) + if context.debug: + print(f"starting {n_prompts} concurrent completion requests...") + assert n_prompts > 0 + for prompt_no in range(n_prompts): + shifted_args = [context.prompts.pop(), *args] + context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) + await asyncio.sleep(0.1) + + +async def request_completion(prompt, + base_url, + debug=False, + n_predict=None, + server_seed=None, + expect_api_error=None, + user_api_key=None): + if debug: + print(f"Sending completion request: {prompt}") + origin = "my.super.domain" + headers = { + 'Origin': origin + } + if user_api_key is not None: + if debug: + print(f"Set user_api_key: {user_api_key}") + headers['Authorization'] = f'Bearer {user_api_key}' + + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/completion', + json={ + "prompt": prompt, + "n_predict": int(n_predict) if n_predict is not None else -1, + "seed": server_seed if server_seed is not None else 42 + }, + headers=headers) as response: + if expect_api_error is None or not expect_api_error: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + return await response.json() + else: + return response.status + + +async def oai_chat_completions(user_prompt, + system_prompt, + base_url, + async_client, + debug=False, + model=None, + n_predict=None, + enable_streaming=None, + server_seed=None, + user_api_key=None, + expect_api_error=None): + if debug: + print(f"Sending OAI Chat completions request: {user_prompt}") + # openai client always expects an api key + user_api_key = user_api_key if user_api_key is not None else 'nope' + seed = server_seed if server_seed is not None else 42 + enable_streaming = enable_streaming if enable_streaming is not None else False + payload = { + "messages": [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": user_prompt, + } + ], + "model": model, + "max_tokens": n_predict, + "stream": enable_streaming, + "seed": seed + } + completion_response = { + 'content': '', + 'timings': { + 'predicted_n': 0 + } + } + if async_client: + origin = 'llama.cpp' + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/v1/chat/completions', + json=payload, + headers=headers) as response: + if enable_streaming: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "text/event-stream" + event_received = True + while event_received: + event_received = False + async for line_in_bytes in response.content: + line = line_in_bytes.decode('utf8') + line = line.rstrip('\n').rstrip('\r') + if line == '': + continue + event_data = line.split(': ', 1) + assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' + chunk_raw = event_data[1] + + chunk = json.loads(chunk_raw) + assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" + delta = chunk['choices'][0]['delta'] + if 'content' in delta: + completion_response['content'] += delta['content'] + completion_response['timings']['predicted_n'] += 1 + else: + if expect_api_error is None or not expect_api_error: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + chat_completion_raw = await response.json() + completion_response = { + 'content': chat_completion_raw['choices'][0]['message'], + 'timings': { + 'predicted_n': chat_completion_raw['usage']['completion_tokens'] + } + } + else: + return response.status + else: + try: + openai.api_key = user_api_key + openai.api_base = f'{base_url}/v1/chat' + chat_completion = openai.Completion.create( + messages=payload['messages'], + model=model, + max_tokens=n_predict, + stream=enable_streaming, + seed=seed + ) + except openai.error.APIError as e: + if expect_api_error is not None and expect_api_error: + return 401 + else: + assert False, f'error raised: {e}' + + if enable_streaming: + for chunk in chat_completion: + assert len(chunk.choices) == 1 + delta = chunk.choices[0].delta + if 'content' in delta: + completion_response['content'] += delta['content'] + completion_response['timings']['predicted_n'] += 1 + else: + assert len(chat_completion.choices) == 1 + completion_response = { + 'content': chat_completion.choices[0].message.content, + 'timings': { + 'predicted_n': chat_completion.usage.completion_tokens + } + } + if debug: + print("OAI response formatted to llama.cpp:", completion_response) + return completion_response + + +async def request_embedding(content, base_url): + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/embedding', + json={ + "content": content, + }) as response: + assert response.status == 200 + response_json = await response.json() + return response_json['embedding'] + + +def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): + content = completion_response['content'] + n_predicted = completion_response['timings']['predicted_n'] + assert len(content) > 0, "no token predicted" + if expected_predicted_n is not None: + assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' + f' {n_predicted} <> {expected_predicted_n}') + if re_content is not None: + re_content = '^.*' + re_content.replace('', '|') + '.*$' + assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), ( + f'invalid tokens predicted:' + f' ```\n{content}\n``` do not match /{re_content}/') + + +async def gather_tasks_results(context): + n_tasks = len(context.concurrent_tasks) + if context.debug: + print(f"Waiting for all {n_tasks} tasks results...") + for task_no in range(n_tasks): + context.tasks_result.append(await context.concurrent_tasks.pop()) + n_completions = len(context.tasks_result) + return n_completions + + +async def wait_for_health_status(context, + base_url, + expected_http_status_code, + expected_health_status, + params=None, + slots_idle=None, + slots_processing=None, + expected_slots=None): + if context.debug: + print(f"Starting checking for health for expected_health_status={expected_health_status}") + timeout = 3 # seconds + interval = 0.5 + counter = 0 + async with aiohttp.ClientSession() as session: + while True: + async with await session.get(f'{base_url}/health', params=params) as health_response: + status_code = health_response.status + health = await health_response.json() + if context.debug: + print(f"HEALTH - response for expected health status='{expected_health_status}' on " + f"'{base_url}/health'?{params} is {health}") + if (status_code == expected_http_status_code + and health['status'] == expected_health_status + and (slots_idle is None or health['slots_idle'] == slots_idle) + and (slots_processing is None or health['slots_processing'] == slots_processing)): + if expected_slots is not None: + assert_slots_status(health['slots'], expected_slots) + return + if (status_code == expected_http_status_code + and health['status'] == expected_health_status + and (slots_idle is None or health['slots_idle'] == slots_idle) + and (slots_processing is None or health['slots_processing'] == slots_processing)): + if expected_slots is not None: + assert_slots_status(health['slots'], expected_slots) + return + await asyncio.sleep(interval) + + counter += interval + if counter >= timeout: + # Sometimes health requests are triggered after completions are predicted + if expected_http_status_code == 503: + if len(context.tasks_result) == 0: + print("\x1b[5;37;43mWARNING: forcing concurrent tasks," + " busy health check missed, probably too fast inference\x1b[0m") + n_completions = await gather_tasks_results(context) + if n_completions > 0: + return + + assert False, 'timeout exceeded' + + +def assert_embeddings(embeddings): + assert len(embeddings) > 0 + embeddings_computed = False + for emb in embeddings: + if emb != 0: + embeddings_computed = True + assert embeddings_computed, f"Embeddings: {embeddings}" + + +async def request_slots_status(context, expected_slots): + async with aiohttp.ClientSession() as session: + async with await session.get(f'{context.base_url}/slots') as slots_response: + assert slots_response.status == 200 + slots = await slots_response.json() + assert_slots_status(slots, expected_slots) + + +def assert_slots_status(slots, expected_slots): + assert len(slots) == len(expected_slots) + for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)): + for key in expected: + assert expected[key] == slot[key], (f"invalid slot {slot_id}" + f" expected[{key}] != slot[{key}]" + f" = {expected[key]} != {slot[key]}") + + +def start_server_background(context): + context.server_path = '../../../build/bin/server' + if 'LLAMA_SERVER_BIN_PATH' in os.environ: + context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] + server_args = [ + '--host', context.server_fqdn, + '--port', context.server_port, + '--model', context.model_file + ] + if context.server_continuous_batching: + server_args.append('--cont-batching') + if context.server_embeddings: + server_args.append('--embedding') + if context.model_alias is not None: + server_args.extend(['--alias', context.model_alias]) + if context.n_ctx is not None: + server_args.extend(['--ctx-size', context.n_ctx]) + if context.n_slots is not None: + server_args.extend(['--parallel', context.n_slots]) + if context.n_server_predict is not None: + server_args.extend(['--n-predict', context.n_server_predict]) + if context.server_api_key is not None: + server_args.extend(['--api-key', context.server_api_key]) + if context.debug: + server_args.append('--verbose') + print(f"starting server with: {context.server_path}", *server_args) + context.server_process = subprocess.Popen( + [str(arg) for arg in [context.server_path, *server_args]], + close_fds=True) + print(f"server pid={context.server_process.pid}") diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature new file mode 100644 index 0000000000000..e228b2371ccce --- /dev/null +++ b/examples/server/tests/features/wrong_usages.feature @@ -0,0 +1,21 @@ +# run with ./test.sh --tags wrong_usage +@wrong_usage +Feature: Wrong usage of llama.cpp server + + #3969 The user must always set --n-predict option + # to cap the number of tokens any completion request can generate + # or pass n_predict/max_tokens in the request. + Scenario: Infinite loop + Given a server listening on localhost:8080 + And a model file stories260K.gguf + # Uncomment below to fix the issue + #And 64 server max tokens to predict + Then the server is starting + Given a prompt: + """ + Go to: infinite loop + """ + # Uncomment below to fix the issue + #And 128 max tokens to predict + Given concurrent completion requests + Then all prompts are predicted diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt new file mode 100644 index 0000000000000..3e51b12dc8207 --- /dev/null +++ b/examples/server/tests/requirements.txt @@ -0,0 +1,3 @@ +aiohttp~=3.9.3 +behave~=1.2.6 +openai~=0.25.0 diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh new file mode 100755 index 0000000000000..17a4e6fc64307 --- /dev/null +++ b/examples/server/tests/tests.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -eu + +if [ $# -lt 1 ] +then + # Start @llama.cpp scenario + behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp +else + behave "$@" +fi + From 4c4cb30736582cacb1a164a9d4bc8e17b1014be7 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sat, 24 Feb 2024 16:23:52 +0200 Subject: [PATCH 37/51] IQ3_S: a much better alternative to Q3_K (#5676) * iq4_nl: squash commits for easier rebase * Basics (quantize, dequantize) * CUDA dequantize and dot product * Slightly faster CUDA dot product (120 t/s) * Switch to 6-bit scales * Scalar dot product * AVX2 dot product * ARM_NEON dot product * Works on metal, but still slow * Slightly better Metal dot product * Another small Metal improvement * Metal dot product is getting there * Faster CUDA dot product * Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided * Report the actual bpw * Add _xs mix that is 4.05 bpw for non-MoE models * Remove IQ4_XS for now, slightly adjust kvalues_iq4nl * AVX2 dot product uses Q8_0 instead of Q8_K * Add to test-backend-ops * Minor fix * Also use use Q5_K for attn_output in MoE models * Fixes after merging latest master * Switching to blocks of 32 * AVX2 for blocks of 32 * Scaler dot product for blocks of 32 * ARM_NEON dot product for blocks of 32 * Metal kernels for blocks of 32 * Slightly faster Metal kernels * Resurrecting iq3_xs After all the experimentation, nothing was better than this. * Minor PPL improvement via a block scale fudge factor * Minor improvement via 3 neighbours * iq3_xs: working scalar and AVX2 dot products * iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s) * iq3_xs: working Metal implementation * Adding IQ3_M - IQ3_XS mix with mostly Q4_K * iiq3_xs: a 3.4375 bpw variant * iq3_xs: make CUDA work for new version * iq3_xs: make scalar and AVX2 work for new version * iq3_s: make ARM_NEON work with new version * iq3_xs: make new version work on metal Performance is very similar to Q3_K_S * iq3_xs: tiny Metal speed improvement * iq3_xs: tiny Metal speed improvement * Fix stupid warning * Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS * iq3_xs: rename to iq3_s * iq3_s: make tests pass * Move Q3_K_XS mix to 3.25 bpw * Attempt to fix failing tests * Another attempt to fix the Windows builds * Attempt to fix ROCm * ROCm again * iq3_s: partial fix for QK_K = 64 * iq3_s: make it work on metal for QK_K = 64 Pleasent surprise: the coding was super-block size independent, so all it took was to delete some QK_K == 256 guards. * Will this fix ROCm? --------- Co-authored-by: Iwan Kawrakow --- examples/quantize/quantize.cpp | 2 + ggml-cuda.cu | 171 ++++++++- ggml-metal.m | 33 +- ggml-metal.metal | 304 +++++++++++++++ ggml-quants.c | 674 +++++++++++++++++++++++++++++---- ggml-quants.h | 20 + ggml.c | 31 ++ ggml.h | 2 + llama.cpp | 50 ++- llama.h | 2 + tests/test-backend-ops.cpp | 2 +- tests/test-quantize-fns.cpp | 4 +- 12 files changed, 1211 insertions(+), 84 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 37520857f99f7..ab7e72aaf8254 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -27,6 +27,8 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, + { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, + { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b0e454e025ec4..21c612cb71b48 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -172,6 +172,7 @@ #endif typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); @@ -196,6 +197,18 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) { return __vsubss4(a, b); } +static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false); @@ -518,6 +531,17 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +#define QR3_XS 8 +#define QI3_XS (QK_K / (4*QR3_XS)) +typedef struct { + half d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[QK_K/64]; +} block_iq3_s; +static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding"); + #define QR1_S 8 #define QI1_S (QK_K / (4*QR1_S)) typedef struct { @@ -1700,6 +1724,74 @@ static const __device__ uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +static const __device__ uint32_t iq3xs_grid[512] = { + 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, + 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, + 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, + 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, + 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, + 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, + 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, + 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, + 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, + 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, + 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, + 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, + 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, + 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, + 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, + 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, + 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, + 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, + 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, + 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, + 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, + 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, + 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, + 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, + 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, + 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, + 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, + 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, + 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, + 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, + 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, + 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, + 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, + 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, + 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, + 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, + 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, + 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, + 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, + 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, + 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, + 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, + 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, + 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, + 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, + 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, + 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, + 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, + 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, + 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, + 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, + 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, + 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, + 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, + 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, + 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, + 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, + 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, + 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, + 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, + 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, + 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, + 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, + 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +}; + + static const __device__ uint64_t iq1s_grid[512] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, @@ -1973,6 +2065,32 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds } +template +static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; + const uint8_t signs = x[i].signs[4*ib + il]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + template static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -4717,6 +4835,41 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( #endif } +// TODO: don't use lookup table for signs +static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + + const int ib32 = iqs; + const uint8_t * qs = bq2->qs + 8*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); + uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); + const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); + const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); + sumi = __dp4a(grid_l, *((int *)q8+0), sumi); + sumi = __dp4a(grid_h, *((int *)q8+1), sumi); + q8 += 8; + } + const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + + static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if QK_K == 256 @@ -6849,6 +7002,12 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq3_xxs<<>>(vx, y); } +template +static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq3_s<<>>(vx, y); +} + template static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; @@ -6904,6 +7063,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq1_s_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6943,6 +7104,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq1_s_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -8688,6 +8851,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -8713,6 +8877,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -8818,6 +8983,10 @@ static void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_q_cuda (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; default: GGML_ASSERT(false); break; @@ -11541,7 +11710,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } ggml_type a_type = a->type; if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) { + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } diff --git a/ggml-metal.m b/ggml-metal.m index 0d4aa43093739..ee584cfa71ce7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -61,6 +61,7 @@ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, @@ -85,6 +86,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, @@ -105,6 +107,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, @@ -122,6 +125,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, @@ -139,6 +143,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, @@ -452,6 +457,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); @@ -476,6 +482,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); @@ -496,6 +503,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); @@ -513,6 +521,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); @@ -530,6 +539,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); @@ -1347,6 +1357,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); @@ -1483,6 +1494,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline; + } break; case GGML_TYPE_IQ1_S: { nth0 = 4; @@ -1537,8 +1554,8 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_IQ3_XXS) { - const int mem_size = 256*4+128; + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1640,6 +1657,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); @@ -1779,6 +1797,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline; + } break; case GGML_TYPE_IQ1_S: { nth0 = 4; @@ -1849,8 +1873,8 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_IQ3_XXS) { - const int mem_size = 256*4+128; + else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) { + const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1900,6 +1924,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; diff --git a/ggml-metal.metal b/ggml-metal.metal index c223a981c246a..b3bf405391d3e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2525,6 +2525,20 @@ typedef struct { } block_iq3_xxs; // 98 bytes / block for QK_K = 256, so 3.0625 bpw +// 3.4375 bpw +#if QK_K == 64 +#define IQ3S_N_SCALE 2 +#else +#define IQ3S_N_SCALE QK_K/64 +#endif +typedef struct { + half d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[IQ3S_N_SCALE]; +} block_iq3_s; + typedef struct { half d; uint8_t qs[QK_K/8]; @@ -3795,6 +3809,73 @@ constexpr constant static uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +constexpr constant static uint32_t iq3xs_grid[512] = { + 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, + 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, + 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, + 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, + 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, + 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, + 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, + 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, + 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, + 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, + 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, + 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, + 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, + 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, + 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, + 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, + 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, + 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, + 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, + 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, + 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, + 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, + 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, + 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, + 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, + 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, + 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, + 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, + 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, + 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, + 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, + 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, + 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, + 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, + 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, + 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, + 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, + 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, + 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, + 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, + 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, + 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, + 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, + 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, + 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, + 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, + 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, + 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, + 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, + 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, + 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, + 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, + 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, + 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, + 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, + 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, + 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, + 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, + 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, + 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, + 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, + 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, + 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, + 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +}; + #define NGRID_IQ1S 512 constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, @@ -4361,6 +4442,136 @@ kernel void kernel_mul_mv_iq3_xxs_f32( kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq3_s_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values; + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq3_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 8 * ib; + device const uint8_t * qh = xr->qh + ib; + device const uint8_t * sc = xr->scales + (ib/2); + device const uint8_t * signs = xr->signs + 4 * ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf)); + + float2 sum = {0}; + for (int l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]); + sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]); + } + } + sumf[row] += d * (sum[0] + sum[1]); + + dh += nb*sizeof(block_iq3_s)/2; + qs += nb*sizeof(block_iq3_s); + qh += nb*sizeof(block_iq3_s); + sc += nb*sizeof(block_iq3_s); + signs += nb*sizeof(block_iq3_s); + } + + y4 += 32 * 32; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f; + } + } +} + +[[host_name("kernel_mul_mv_iq3_s_f32")]] +kernel void kernel_mul_mv_iq3_s_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + void kernel_mul_mv_iq1_s_f32_impl( device const void * src0, device const float * src1, @@ -4952,6 +5163,31 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x } } +template +void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 8*ib32; + device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256))); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); + reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); + } + grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256))); + grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256))); + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); + reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); + } +} + template void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 @@ -5525,6 +5761,7 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; @@ -5566,6 +5803,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -5619,6 +5857,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -6589,6 +6828,71 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32( sgitg); } +[[host_name("kernel_mul_mv_id_iq3_s_f32")]] +kernel void kernel_mul_mv_id_iq3_s_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq3_s_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} + [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel void kernel_mul_mv_id_iq1_s_f32( device const char * ids, diff --git a/ggml-quants.c b/ggml-quants.c index b15977f53e2f3..5c5f2ce1b9b87 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3505,6 +3505,73 @@ static const uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +static const uint32_t iq3xs_grid[512] = { + 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, + 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, + 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, + 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, + 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, + 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, + 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, + 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, + 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, + 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, + 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, + 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, + 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, + 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, + 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, + 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, + 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, + 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, + 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, + 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, + 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, + 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, + 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, + 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, + 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, + 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, + 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, + 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, + 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, + 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, + 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, + 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, + 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, + 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, + 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, + 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, + 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, + 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, + 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, + 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, + 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, + 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, + 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, + 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, + 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, + 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, + 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, + 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, + 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, + 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, + 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, + 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, + 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, + 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, + 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, + 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, + 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, + 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, + 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, + 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, + 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, + 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, + 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, + 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +}; + #define NGRID_IQ2XXS 512 static const uint64_t iq1s_grid[NGRID_IQ2XXS] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, @@ -3736,6 +3803,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y } } +// ====================== 3.3125 bpw (de)-quantization + +void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = x[i].signs; + + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f; + const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + y += 8; + } + qs += 8; + signs += 4; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + y += 8; + } + qh += 2; + qs += 8; + signs += 4; + } + } +} + // ====================== 1.5625 bpw (de)-quantization void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) { @@ -8806,6 +8916,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif +#if defined (__AVX2__) || defined (__ARM_NEON) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, @@ -8840,6 +8951,7 @@ static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; +#endif void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -9327,6 +9439,202 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + + uint8x16x2_t vs; + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)x[i].signs; + const int8_t * restrict q8 = y[i].qs; + int sumi1 = 0, sumi2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)], + iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]}; + const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)], + iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]}; + const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)], + iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]}; + const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)], + iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]}; + qs += 16; + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); + vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0])); + q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); + vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + signs += 4; + + q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0])); + q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1])); + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf)); + sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4)); + } + sumf += d*(sumi1 + sumi2); + } + *s = 0.25f * sumf; + +#elif defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)x[i].signs; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)], + iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)], + iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)], + iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)], + iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)], + iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)], + iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)], + iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]); + qs += 8; + const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)], + iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)], + iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)], + iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)], + iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)], + iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)], + iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)], + iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]); + qs += 8; + + __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint8_t * restrict signs = x[i].signs; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + + #ifdef __AVX2__ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); @@ -9523,6 +9831,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ib = 0; ib < nb; ib += 2) { + q4bits.val[0] = vld1q_u8(x[ib+0].qs); q4bits.val[1] = vld1q_u8(x[ib+1].qs); q8b.val[0] = vld1q_s8(y[ib+0].qs); @@ -10239,14 +10548,15 @@ typedef struct { uint16_t * neighbours; } iq3_entry_t; -static iq3_entry_t iq3_data[1] = { +static iq3_entry_t iq3_data[2] = { + {NULL, NULL, NULL}, {NULL, NULL, NULL}, }; static inline int iq3_data_index(int grid_size) { (void)grid_size; - GGML_ASSERT(grid_size == 256); - return 0; + GGML_ASSERT(grid_size == 256 || grid_size == 512); + return grid_size == 256 ? 0 : 1; } static int iq3_compare_func(const void * left, const void * right) { @@ -10278,9 +10588,44 @@ void iq3xs_init_impl(int grid_size) { 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610, 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992, }; + static const uint16_t kgrid_512[512] = { + 0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34, + 37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77, + 80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142, + 145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210, + 217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288, + 291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393, + 395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514, + 516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576, + 577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653, + 655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727, + 728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833, + 840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977, + 989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047, + 1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103, + 1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199, + 1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296, + 1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415, + 1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561, + 1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648, + 1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761, + 1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877, + 1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068, + 2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177, + 2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269, + 2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520, + 2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634, + 2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805, + 2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083, + 3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276, + 3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591, + 3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729, + 3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032, + }; + const int kmap_size = 4096; - const int nwant = 2; - const uint16_t * kgrid = kgrid_256; + const int nwant = grid_size == 256 ? 2 : 3; + const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; uint32_t * kgrid_q3xs; int * kmap_q3xs; uint16_t * kneighbors_q3xs; @@ -10377,7 +10722,7 @@ void iq3xs_init_impl(int grid_size) { } void iq3xs_free_impl(int grid_size) { - GGML_ASSERT(grid_size == 256); + GGML_ASSERT(grid_size == 256 || grid_size == 512); const int gindex = iq3_data_index(grid_size); if (iq3_data[gindex].grid) { free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL; @@ -10410,9 +10755,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n, + const float * restrict quant_weights) { - const int gindex = iq3_data_index(256); + const int gindex = iq3_data_index(grid_size); const uint32_t * kgrid_q3xs = iq3_data[gindex].grid; const int * kmap_q3xs = iq3_data[gindex].map; @@ -10426,9 +10772,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict const int kMaxQ = 8; - const int nbl = n/256; + const int nbl = n/QK_K; - block_iq3_xxs * y = vy; + ggml_fp16_t * dh; + uint8_t * qs; + int block_size; + if (grid_size == 256) { + block_iq3_xxs * y = vy; + dh = &y->d; + qs = y->qs; + block_size = sizeof(block_iq3_xxs); + } else { + block_iq3_s * y = vy; + dh = &y->d; + qs = y->qs; + block_size = sizeof(block_iq3_s); + } + int quant_size = block_size - sizeof(ggml_fp16_t); float scales[QK_K/32]; float weight[32]; @@ -10439,20 +10799,21 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict bool is_on_grid[8]; bool is_on_grid_aux[8]; uint8_t block_signs[8]; - uint8_t q3[3*(QK_K/8)]; + uint8_t q3[3*(QK_K/8)+QK_K/32]; uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4); + uint8_t * qh = q3 + 3*(QK_K/8); for (int ibl = 0; ibl < nbl; ++ibl) { - y[ibl].d = GGML_FP32_TO_FP16(0.f); - memset(q3, 0, 3*QK_K/8); + dh[0] = GGML_FP32_TO_FP16(0.f); + memset(q3, 0, 3*QK_K/8+QK_K/32); float max_scale = 0; const float * xbl = x + QK_K*ibl; float sumx2 = 0; for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; - float sigma2 = sumx2/QK_K; + float sigma2 = 2*sumx2/QK_K; for (int ib = 0; ib < QK_K/32; ++ib) { const float * xb = xbl + 32*ib; @@ -10570,7 +10931,13 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict printf("\n"); GGML_ASSERT(false); } - q3[8*ib+k] = grid_index; + if (grid_size == 256) { + q3[8*ib+k] = grid_index; + } else { + q3[8*ib+k] = grid_index & 255; + qh[ib] |= ((grid_index >> 8) << k); + } + } scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21); GGML_ASSERT(scale >= 0); @@ -10579,63 +10946,25 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict } if (!max_scale) { - memset(y[ibl].qs, 0, 3*QK_K/8); + memset(qs, 0, quant_size); + dh += block_size/sizeof(ggml_fp16_t); + qs += block_size; continue; } float d = max_scale/31; - y[ibl].d = GGML_FP32_TO_FP16(d); + dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor float id = 1/d; - float sumqx = 0, sumq2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(15, l)); scales_and_signs[ib] |= ((uint32_t)l << 28); - if (false) { - const float * xb = xbl + 32*ib; - if (quant_weights) { - const float * qw = quant_weights + QK_K*ibl + 32*ib; - for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - } else { - for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i]; - } - const float db = 0.25f * d * (1 + 2*l); - for (int k = 0; k < 8; ++k) { - const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2); - const float * xk = xb + 4*k; - const float * wk = weight + 4*k; - //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]); - const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]); - float best_mse = 0; int best_index = q3[8*ib+k]; - for (int j = 0; j < 4; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - best_mse += wk[j] * diff * diff; - } - for (int idx = 0; idx < 256; ++idx) { - //grid = (const uint8_t *)(kgrid_q3xs + idx); - grid = (const uint8_t *)(iq3xxs_grid + idx); - float mse = 0; - for (int j = 0; j < 4; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - mse += wk[j] * diff * diff; - } - if (mse < best_mse) { - best_mse = mse; best_index = idx; - } - } - q3[8*ib+k] = best_index; - //grid = (const uint8_t *)(kgrid_q3xs + best_index); - grid = (const uint8_t *)(iq3xxs_grid + best_index); - for (int j = 0; j < 4; ++j) { - float q = db * grid[j] * signs[j]; - sumqx += wk[j] * q * xk[j]; - sumq2 += wk[j] * q * q; - } - } - if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2); - } } - memcpy(y[ibl].qs, q3, 3*QK_K/8); + memcpy(qs, q3, quant_size); + + dh += block_size/sizeof(ggml_fp16_t); + qs += block_size; + } } @@ -10645,7 +10974,7 @@ size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int nblock = n_per_row/QK_K; char * qrow = (char *)dst; for (int row = 0; row < nrow; ++row) { - quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights); + quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights); src += n_per_row; qrow += nblock*sizeof(block_iq3_xxs); } @@ -10660,9 +10989,226 @@ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) { assert(k % QK_K == 0); - quantize_row_iq3_xxs_impl(x, y, k, NULL); + quantize_row_iq3_xxs_impl(256, x, y, k, NULL); +} + +static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n, + const float * restrict quant_weights, + float * scales, + float * weight, + float * xval, + int8_t * L, + int8_t * Laux, + float * waux, + bool * is_on_grid, + bool * is_on_grid_aux, + uint8_t * block_signs) { + + const int gindex = iq3_data_index(512); + + const uint32_t * kgrid_q3xs = iq3_data[gindex].grid; + const int * kmap_q3xs = iq3_data[gindex].map; + const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours; + + //GGML_ASSERT(quant_weights && "missing quantization weights"); + GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 8; + + const int nbl = n/QK_K; + + block_iq3_s * y = vy; + + const int bs4 = block_size/4; + const int bs8 = block_size/8; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + memset(&y[ibl], 0, sizeof(block_iq3_s)); + y[ibl].d = GGML_FP32_TO_FP16(0.f); + + uint8_t * qs = y[ibl].qs; + uint8_t * qh = y[ibl].qh; + uint8_t * signs = y[ibl].signs; + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = 2*sumx2/QK_K; + + for (int ib = 0; ib < QK_K/block_size; ++ib) { + const float * xb = xbl + block_size*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i]; + } + for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < bs8; ++k) { + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; s |= (1 << i); + } + } + block_signs[k] = s; + } + float max = xval[0]; + for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + for (int is = -15; is <= 15; ++is) { + float id = (2*kMaxQ-1+is*0.2f)/max; + float this_scale = 1/id; + for (int k = 0; k < bs4; ++k) { + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < block_size; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < block_size; ++i) L[i] = Laux[i]; + for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < bs4; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 3*i); + } + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k); + } + const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index); + for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2; + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < block_size; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale) + // and correspondingly flip quant signs. + scale = -scale; + for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k]; + } + for (int k = 0; k < bs4; ++k) { + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); + printf("\n"); + GGML_ASSERT(false); + } + qs[k] = grid_index & 255; + qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8)); + } + qs += bs4; + for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k]; + signs += bs8; + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + continue; + } + + float d = max_scale/31; + y[ibl].d = GGML_FP32_TO_FP16(d); + float id = 1/d; + for (int ib = 0; ib < QK_K/block_size; ib += 2) { + int l1 = nearest_int(0.5f*(id*scales[ib+0]-1)); + l1 = MAX(0, MIN(15, l1)); + int l2 = nearest_int(0.5f*(id*scales[ib+1]-1)); + l2 = MAX(0, MIN(15, l2)); + y[ibl].scales[ib/2] = l1 | (l2 << 4); + } + + } +} + +#define IQ3S_BLOCK_SIZE 32 +size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + float scales[QK_K/IQ3S_BLOCK_SIZE]; + float weight[IQ3S_BLOCK_SIZE]; + float xval[IQ3S_BLOCK_SIZE]; + int8_t L[IQ3S_BLOCK_SIZE]; + int8_t Laux[IQ3S_BLOCK_SIZE]; + float waux[IQ3S_BLOCK_SIZE]; + bool is_on_grid[IQ3S_BLOCK_SIZE/4]; + bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4]; + uint8_t block_signs[IQ3S_BLOCK_SIZE/8]; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights, + scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs); + src += n_per_row; + qrow += nblock*sizeof(block_iq3_s); + } + return nrow * nblock * sizeof(block_iq3_s); +} + +void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq3_s * restrict y = vy; + quantize_row_iq3_s_reference(x, y, k); } +void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) { + assert(k % QK_K == 0); + quantize_iq3_s(x, y, 1, k, NULL, NULL); +} + + // =================================== 1.5 bpw =================================================== static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, diff --git a/ggml-quants.h b/ggml-quants.h index 113623b62938a..303b0b6f9552e 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -191,6 +191,21 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +// 3.4375 bpw +#if QK_K == 64 +#define IQ3S_N_SCALE 2 +#else +#define IQ3S_N_SCALE QK_K/64 +#endif +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[IQ3S_N_SCALE]; +} block_iq3_s; +static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); + typedef struct { ggml_fp16_t d; uint8_t qs[QK_K/8]; @@ -226,6 +241,7 @@ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGM void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); +void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); @@ -242,6 +258,7 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -262,6 +279,7 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -280,6 +298,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -289,6 +308,7 @@ size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/ggml.c b/ggml.c index d710fe702ddbd..c09a3cad657f2 100644 --- a/ggml.c +++ b/ggml.c @@ -678,6 +678,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ3_S] = { + .type_name = "iq3_s", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_s), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_s, + .from_float = quantize_row_iq3_s, + .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference, + .vec_dot = ggml_vec_dot_iq3_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", .blck_size = QK_K, @@ -2304,6 +2316,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; + case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7738,6 +7751,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -8017,6 +8031,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -8141,6 +8156,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: default: { GGML_ASSERT(false); @@ -11039,6 +11055,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -11227,6 +11244,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: default: { GGML_ASSERT(false); @@ -11429,6 +11447,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -12129,6 +12148,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -12212,6 +12232,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -19463,6 +19484,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; + case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; default: // nothing break; } @@ -19737,6 +19759,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ3_S: + { + GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_IQ1_S: { GGML_ASSERT(start % QK_K == 0); diff --git a/ggml.h b/ggml.h index 37eff627928e8..a4166e1f7afd0 100644 --- a/ggml.h +++ b/ggml.h @@ -350,6 +350,7 @@ extern "C" { GGML_TYPE_IQ3_XXS = 18, GGML_TYPE_IQ1_S = 19, GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -389,6 +390,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors }; // available tensor operations: diff --git a/llama.cpp b/llama.cpp index 37477e6ef3c44..1f6b6cff48987 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2545,6 +2545,7 @@ struct llama_model_loader { case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; + case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2890,6 +2891,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; default: return "unknown, may not work"; } @@ -10544,6 +10547,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -10575,13 +10584,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { - new_type = GGML_TYPE_Q2_K; + new_type = GGML_TYPE_IQ3_XXS; + } + } else if (name.find("attn_q.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { + new_type = GGML_TYPE_IQ3_XXS; } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { @@ -10592,6 +10605,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || + (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } @@ -10623,37 +10640,41 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || - ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q5_K; } } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } } else if (name.find("attn_qkv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { - new_type = GGML_TYPE_Q2_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { + new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { - new_type = GGML_TYPE_Q2_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { + new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_up; } @@ -10673,7 +10694,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -10688,6 +10709,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ1_S: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break; @@ -10719,7 +10741,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_Q3_K_XS: + case LLAMA_FTYPE_MOSTLY_Q3_K_XS: quantized_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; @@ -10733,6 +10755,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break; + case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/llama.h b/llama.h index 84f196b3bb625..889edf4d97b96 100644 --- a/llama.h +++ b/llama.h @@ -102,6 +102,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 55db42bf6e851..f8574588bcf2f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1918,7 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, - GGML_TYPE_IQ4_NL, + GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, }; // unary ops diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 5e92d5742a3cc..04656bb9e8e83 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -151,6 +151,7 @@ int main(int argc, char * argv[]) { const float max_quantization_error = type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : + type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; failed = !(total_error < max_quantization_error); num_failed += failed; @@ -167,7 +168,8 @@ int main(int argc, char * argv[]) { const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR; + type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT + : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed; if (failed || verbose) { From 9e359a4f47c1b2dceb99e29706c9f7403d32ab5e Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 24 Feb 2024 19:16:04 +0100 Subject: [PATCH 38/51] server: continue to update other slots on embedding concurrent request (#5699) * server: #5655 - continue to update other slots on embedding concurrent request. * server: tests: add multi users embeddings as fixed * server: tests: adding OAI compatible embedding concurrent endpoint * server: tests: adding OAI compatible embedding with multiple inputs --- examples/server/server.cpp | 2 +- examples/server/tests/features/issues.feature | 34 +--- .../server/tests/features/parallel.feature | 46 ++++++ examples/server/tests/features/server.feature | 13 ++ examples/server/tests/features/steps/steps.py | 151 +++++++++++++----- 5 files changed, 168 insertions(+), 78 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9fb436c2a18ec..19a8c1067e72a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1836,7 +1836,7 @@ struct llama_server_context send_embedding(slot); slot.release(); slot.i_batch = -1; - return true; + continue; } completion_token_output result; diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature index 542006d9a8df2..bf5a175a357ca 100644 --- a/examples/server/tests/features/issues.feature +++ b/examples/server/tests/features/issues.feature @@ -1,36 +1,4 @@ # List of ongoing issues @bug Feature: Issues - # Issue #5655 - Scenario: Multi users embeddings - Given a server listening on localhost:8080 - And a model file stories260K.gguf - And a model alias tinyllama-2 - And 42 as server seed - And 64 KV cache size - And 2 slots - And continuous batching - And embeddings extraction - Then the server is starting - Then the server is healthy - - Given a prompt: - """ - Write a very long story about AI. - """ - And a prompt: - """ - Write another very long music lyrics. - """ - And a prompt: - """ - Write a very long poem. - """ - And a prompt: - """ - Write a very long joke. - """ - Given concurrent embedding requests - Then the server is busy - Then the server is idle - Then all embeddings are generated + # No confirmed issue at the moment diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 802d624ffc9a3..c85f9de1d9a52 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -8,6 +8,7 @@ Feature: Parallel And 42 as server seed And 64 KV cache size And 2 slots + And embeddings extraction And continuous batching Then the server is starting Then the server is healthy @@ -75,3 +76,48 @@ Feature: Parallel Then the server is busy Then the server is idle Then all prompts are predicted + + Scenario: Multi users embeddings + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated + + Scenario: Multi users OAI compatibility embeddings + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + And a prompt: + """ + What is the biggest US city ? + """ + And a prompt: + """ + What is the capital of Bulgaria ? + """ + And a model tinyllama-2 + Given concurrent OAI embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index fedcfe5aef1b3..5f81d256a548c 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -60,6 +60,19 @@ Feature: llama.cpp server """ Then embeddings are generated + Scenario: OAI Embeddings compatibility with multiple inputs + Given a model tinyllama-2 + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + When an OAI compatible embeddings computation request for multiple inputs + Then embeddings are generated + Scenario: Tokenize / Detokenize When tokenizing: diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 50f2b641e764e..9c825fdbcd7f5 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,4 +1,5 @@ import asyncio +import collections import json import os import re @@ -261,35 +262,35 @@ def step_a_prompt_prompt(context, prompt): @step(u'concurrent completion requests') @async_run_until_complete() async def step_concurrent_completion_requests(context): - await concurrent_completion_requests(context, - request_completion, - # prompt is inserted automatically - context.base_url, - debug=context.debug, - n_predict=context.n_predict if hasattr(context, 'n_predict') else None, - server_seed=context.server_seed if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key if hasattr(context, - 'user_api_key') else None) + await concurrent_requests(context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + server_seed=context.server_seed if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key if hasattr(context, + 'user_api_key') else None) @step(u'concurrent OAI completions requests') @async_run_until_complete async def step_oai_chat_completions(context): - await concurrent_completion_requests(context, oai_chat_completions, - # user_prompt is inserted automatically - context.system_prompt, - context.base_url, - True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed - if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) @step(u'all prompts are predicted') @@ -316,36 +317,58 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None): @step(u'embeddings are computed for') @async_run_until_complete async def step_compute_embedding(context): - content = context.text - base_url = context.base_url - context.embeddings = await request_embedding(content, base_url) + context.embeddings = await request_embedding(context.text, base_url=context.base_url) @step(u'embeddings are generated') def step_assert_embeddings(context): - assert_embeddings(context.embeddings) + if len(context.prompts) == 0: + assert_embeddings(context.embeddings) + else: + assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n" + f"context.prompts={context.prompts}\n" + f"context.embeddings={context.embeddings}") + for embedding in context.embeddings: + context.prompts.pop() + assert_embeddings(embedding) @step(u'an OAI compatible embeddings computation request for') -def step_oai_compute_embedding(context): - openai.api_key = 'nope' # openai client always expects an api_keu - if context.user_api_key is not None: - openai.api_key = context.user_api_key - openai.api_base = f'{context.base_url}/v1' - embeddings = openai.Embedding.create( - model=context.model, - input=context.text, - ) - context.embeddings = embeddings +@async_run_until_complete +async def step_oai_compute_embeddings(context): + context.embeddings = await request_oai_embeddings(context.text, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) + + +@step(u'an OAI compatible embeddings computation request for multiple inputs') +@async_run_until_complete +async def step_oai_compute_embeddings_multiple_inputs(context): + context.embeddings = await request_oai_embeddings(context.prompts, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) @step(u'concurrent embedding requests') @async_run_until_complete() async def step_concurrent_embedding_requests(context): - await concurrent_completion_requests(context, - request_embedding, - # prompt is inserted automatically - context.base_url) + await concurrent_requests(context, + request_embedding, + # prompt is inserted automatically + base_url=context.base_url) + + +@step(u'concurrent OAI embedding requests') +@async_run_until_complete() +async def step_concurrent_oai_embedding_requests(context): + await concurrent_requests(context, + request_oai_embeddings, + # prompt is inserted automatically + base_url=context.base_url, + async_client=True, + model=context.model) @step(u'all embeddings are generated') @@ -401,7 +424,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value): assert context.options_response.headers[cors_header] == cors_header_value -async def concurrent_completion_requests(context, f_completion, *args, **kwargs): +async def concurrent_requests(context, f_completion, *args, **kwargs): n_prompts = len(context.prompts) if context.debug: print(f"starting {n_prompts} concurrent completion requests...") @@ -565,7 +588,7 @@ async def oai_chat_completions(user_prompt, return completion_response -async def request_embedding(content, base_url): +async def request_embedding(content, base_url=None): async with aiohttp.ClientSession() as session: async with session.post(f'{base_url}/embedding', json={ @@ -576,6 +599,46 @@ async def request_embedding(content, base_url): return response_json['embedding'] +async def request_oai_embeddings(input, + base_url=None, user_api_key=None, + model=None, async_client=False): + # openai client always expects an api_key + user_api_key = user_api_key if user_api_key is not None else 'nope' + if async_client: + origin = 'llama.cpp' + if user_api_key is not None: + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/v1/embeddings', + json={ + "input": input, + "model": model, + }, + headers=headers) as response: + assert response.status == 200, f"received status code not expected: {response.status}" + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + response_json = await response.json() + assert response_json['model'] == model, f"invalid model received: {response_json['model']}" + assert response_json['object'] == 'list' + return response_json['data'] + else: + openai.api_key = user_api_key + openai.api_base = f'{base_url}/v1' + oai_embeddings = openai.Embedding.create( + model=model, + input=input, + ) + + if isinstance(input, collections.abc.Sequence): + embeddings = [] + for an_oai_embeddings in oai_embeddings.data: + embeddings.append(an_oai_embeddings.embedding) + else: + embeddings = oai_embeddings.data.embedding + return embeddings + + def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): content = completion_response['content'] n_predicted = completion_response['timings']['predicted_n'] From 69917dfa55674c608360638bb4d6a12a315e2810 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Sun, 25 Feb 2024 10:54:04 +0100 Subject: [PATCH 39/51] py : fix StableLM conversion after config.json changes (#5703) * Fix issues during StableLM models conversion * Fix hard coded layer_norm_eps * Support layer_norm_eps for LlavaStableLM Co-authored-by: Jared Van Bortel * Add missing parenthesis Co-authored-by: Jared Van Bortel * Support rotary_factor for LlavaStableLM Co-authored-by: Jared Van Bortel * fix typo * Add StableLMEpochForCausalLM for safety Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> * Add StableLMEpochForCausalLM for safety 2 Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> --------- Co-authored-by: Jared Van Bortel Co-authored-by: Jared Van Bortel Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> --- convert-hf-to-gguf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 32d54b45f3325..ae30b2a76971a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -192,7 +192,7 @@ def from_model_architecture(model_architecture): return RefactModel if model_architecture == "PersimmonForCausalLM": return PersimmonModel - if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return StableLMModel if model_architecture == "QWenLMHeadModel": return QwenModel @@ -253,7 +253,7 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.REFACT if arch == "PersimmonForCausalLM": return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return gguf.MODEL_ARCH.STABLELM if arch == "QWenLMHeadModel": return gguf.MODEL_ARCH.QWEN @@ -1074,10 +1074,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) class MixtralModel(Model): From ab336a9d5e5352ecdcdf4c12d2d54cf4ef82ce31 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 12:09:09 +0200 Subject: [PATCH 40/51] code : normalize enum names (#5697) * coda : normalize enum names ggml-ci * code : cont * code : cont --- common/common.cpp | 18 +- common/common.h | 4 +- common/train.cpp | 10 +- examples/baby-llama/baby-llama.cpp | 2 +- examples/finetune/finetune.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 14 +- examples/llava/llava.cpp | 2 +- examples/server/server.cpp | 18 +- .../train-text-from-scratch.cpp | 2 +- ggml-cuda.cu | 138 +++---- ggml-metal.m | 4 +- ggml-opencl.cpp | 50 +-- ggml-sycl.cpp | 152 ++++---- ggml-vulkan.cpp | 102 ++--- ggml.c | 350 +++++++++--------- ggml.h | 38 +- llama.cpp | 64 ++-- llama.h | 28 +- tests/test-backend-ops.cpp | 4 +- tests/test-opt.cpp | 2 +- 20 files changed, 502 insertions(+), 502 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 10ef11829cc50..ec596f5a075de 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; @@ -837,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.int_value = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.float_value = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.bool_value = true; } else if (std::strcmp(sep, "false") == 0) { diff --git a/common/common.h b/common/common.h index 935771d44ca9c..3e21579b00545 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/common/train.cpp b/common/train.cpp index e4c3d5df61818..0dbfd24df2314 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -31,7 +31,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; - state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; @@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g std::string opt_type; GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; + opt->params.type = GGML_OPT_TYPE_ADAM; GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); @@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; + opt->params.type = GGML_OPT_TYPE_LBFGS; GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); @@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); @@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_add_tensor(fctx, opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 65bb238a0d565..bf0125e753746 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS); opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.lbfgs.n_iter = 16; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 98bf5a07a7ed1..3da5317b3d910 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_output = n_rank_output; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae7625..8fec3d43ddfdd 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_NONE: return "none"; - case LLAMA_SPLIT_LAYER: return "layer"; - case LLAMA_SPLIT_ROW: return "row"; + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_LAYER}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_NONE; + mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_LAYER; + mode = LLAMA_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_ROW; + mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 1a1cf7c78bf34..9801281661b25 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); - if (newline_tmp->backend != GGML_BACKEND_CPU) { + if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) { if (newline_tmp->buffer == NULL) { printf("newline_tmp tensor buffer is NULL\n"); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 19a8c1067e72a..780862ef67810 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2086,9 +2086,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2212,15 +2212,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; @@ -2447,15 +2447,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.int_value = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.float_value = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.bool_value = true; } else if (std::strcmp(sep, "false") == 0) { diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e78ab185d89f3..7eafe8515a943 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -960,7 +960,7 @@ int main(int argc, char ** argv) { struct ggml_opt_context * opt = train->opt; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 21c612cb71b48..fb6d4f7d215b6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6369,11 +6369,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } @@ -7927,10 +7927,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co const dim3 block_dims(ncols, 1, 1); const dim3 block_nums(1, nrows, 1); - if (order == GGML_SORT_ASC) { - k_argsort_f32_i32<<>>(x, dst, ncols); - } else if (order == GGML_SORT_DESC) { - k_argsort_f32_i32<<>>(x, dst, ncols); + if (order == GGML_SORT_ORDER_ASC) { + k_argsort_f32_i32<<>>(x, dst, ncols); + } else if (order == GGML_SORT_ORDER_DESC) { + k_argsort_f32_i32<<>>(x, dst, ncols); } else { GGML_ASSERT(false); } @@ -8362,11 +8362,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( cudaMemcpyKind kind; char * src_ptr; - if (src->backend == GGML_BACKEND_CPU) { + if (src->backend == GGML_BACKEND_TYPE_CPU) { kind = cudaMemcpyHostToDevice; src_ptr = (char *) src->data; - } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -8771,7 +8771,7 @@ static void ggml_cuda_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -8920,7 +8920,7 @@ static void ggml_cuda_op_mul_mat_vec_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -9096,7 +9096,7 @@ static void ggml_cuda_op_mul_mat_cublas( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; const int compute_capability = g_device_caps[id].cc; @@ -9444,7 +9444,7 @@ static void ggml_cuda_op_soft_max( const bool use_src2 = src2 != nullptr; if (use_src2) { - const bool src2_on_device = src2->backend == GGML_BACKEND_GPU; + const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU; if (src2_on_device) { ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; @@ -9502,16 +9502,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; // dd = data device float * src0_ddf = nullptr; @@ -9555,7 +9555,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -9636,8 +9636,8 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -9653,20 +9653,20 @@ static void ggml_cuda_op_mul_mat( ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); std::array tensor_split; if (split) { - // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; tensor_split = buft_ctx->tensor_split; @@ -9724,8 +9724,8 @@ static void ggml_cuda_op_mul_mat( used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; ggml_cuda_set_device(id); cudaStream_t stream = g_cudaStreams[id][0]; @@ -9776,8 +9776,8 @@ static void ggml_cuda_op_mul_mat( continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); @@ -9802,12 +9802,12 @@ static void ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { if (id != g_main_device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; @@ -9820,14 +9820,14 @@ static void ggml_cuda_op_mul_mat( src1_ncols*ne10*sizeof(float), stream)); } } - } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -9845,10 +9845,10 @@ static void ggml_cuda_op_mul_mat( if (!dst_on_device) { void * dst_off_device; cudaMemcpyKind kind; - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { dst_off_device = dst->data; kind = cudaMemcpyDeviceToHost; - } else if (dst->backend == GGML_BACKEND_GPU) { + } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { dst_off_device = dst_extra->data_device[g_main_device]; kind = cudaMemcpyDeviceToDevice; } else { @@ -9913,7 +9913,7 @@ static void ggml_cuda_op_mul_mat( } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { ggml_cuda_set_device(g_main_device); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -10019,7 +10019,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -10050,7 +10050,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -10109,7 +10109,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS @@ -10255,11 +10255,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_GPU) && - ( dst->backend == GGML_BACKEND_GPU); + (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_TYPE_GPU) && + ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; @@ -10409,7 +10409,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); @@ -10553,7 +10553,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s cudaStream_t stream = g_cudaStreams[g_main_device][0]; - if (ids->backend == GGML_BACKEND_GPU) { + if (ids->backend == GGML_BACKEND_TYPE_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -10570,20 +10570,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_GPU; - dst_row.backend = GGML_BACKEND_GPU; + src1_row.backend = GGML_BACKEND_TYPE_GPU; + dst_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_CPU ? + char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_CPU ? + char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; @@ -10611,9 +10611,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); - const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? + const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? + const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ? cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; for (int32_t row_id = 0; row_id < n_as; ++row_id) { @@ -10668,7 +10668,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CUDA_CHECK(cudaStreamSynchronize(stream)); } } @@ -10685,8 +10685,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -10817,9 +10817,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st if (!g_cublas_loaded) return false; ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; @@ -10966,14 +10966,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); } if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } func(tensor->src[0], tensor->src[1], tensor); @@ -11072,7 +11072,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { @@ -11087,7 +11087,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -11098,7 +11098,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t } GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -11333,7 +11333,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); } } - tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; tensor->extra = extra; } @@ -11605,7 +11605,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); } @@ -11614,7 +11614,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } @@ -11644,7 +11644,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg ggml_cuda_set_main_device(cuda_ctx->device); ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11654,13 +11654,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg } #ifndef NDEBUG - assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); + assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->extra != nullptr); } diff --git a/ggml-metal.m b/ggml-metal.m index ee584cfa71ce7..3d6b01263acb5 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2262,8 +2262,8 @@ static bool ggml_metal_graph_compute( id pipeline = nil; switch (order) { - case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; - case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; + case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; default: GGML_ASSERT(false); }; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 797bee66799b5..df619a884842c 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) { } void ggml_cl_free_data(const struct ggml_tensor* tensor) { - if (tensor->backend != GGML_BACKEND_GPU) { + if (tensor->backend != GGML_BACKEND_TYPE_GPU) { return; } @@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o } static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src } static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr size_t y_size; size_t d_size; cl_mem d_X; - if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr // TODO: copy src0 here when r3>1 for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_GPU) { + if (src0->backend == GGML_BACKEND_TYPE_GPU) { x_offset = (i03 * ne02 + i02) * x_ne; } else { // copy src0 to device @@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - if (src1->backend == GGML_BACKEND_CPU) { + if (src1->backend == GGML_BACKEND_TYPE_CPU) { CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); } @@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); } @@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } } - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_X, x_size); } - if (src1->backend != GGML_BACKEND_GPU) { + if (src1->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_Y, y_size); } - if (dst->backend != GGML_BACKEND_GPU) { + if (dst->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_D, d_size); } } @@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr size_t y_size; size_t d_size; cl_mem d_X; - if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); @@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr // TODO: copy src0 here when r3>1 for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_GPU) { + if (src0->backend == GGML_BACKEND_TYPE_GPU) { x_offset = (i03 * ne02 + i02) * x_ne; } else { // copy src0 to device @@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_fp16_to_fp32_row(tmp, d, d_ne); @@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } } - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_X, x_size); } ggml_cl_pool_free(d_Y, y_size); @@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); cl_mem d_Q; - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { d_Q = ggml_cl_pool_malloc(q_sz, &q_size); } @@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { events.emplace_back(); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - } else if (src0->backend == GGML_BACKEND_GPU) { + } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { d_Q = (cl_mem) src0->extra; } else { GGML_ASSERT(false); @@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * if (!mul_mat_vec) { // convert src0 to fp32 on device const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); @@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * // compute const size_t global = ne01 * local; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; const cl_int ncols = ne00; events.emplace_back(); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); @@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } ggml_cl_pool_free(d_Y, y_size); ggml_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { ggml_cl_pool_free(d_Q, q_size); } } @@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) { return true; } @@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { CL_CHECK(clFinish(queue)); tensor->extra = dst; - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); } // ggml-backend @@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ctx->sub_buffers.push_back(sub_buffer); tensor->extra = sub_buffer; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; } static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index b897828f967b1..c6c3c6e6fef07 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){ size_t total_elements = ggml_nelements(src); - const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT; + const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT; float *src_data =NULL; if(src_on_device) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; @@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } @@ -10825,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const sycl::range<3> block_dims(1, 1, ncols); const sycl::range<3> block_nums(1, nrows, 1); - if (order == GGML_SORT_ASC) { + if (order == GGML_SORT_ORDER_ASC) { /* DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query @@ -10834,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); + k_argsort_f32_i32(x, dst, ncols, item_ct1); }); - } else if (order == GGML_SORT_DESC) { + } else if (order == GGML_SORT_ORDER_DESC) { /* DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query @@ -10845,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); + k_argsort_f32_i32(x, dst, ncols, item_ct1); }); } else { GGML_ASSERT(false); @@ -11407,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, dpct::memcpy_direction kind; char * src_ptr; - if (src->backend == GGML_BACKEND_CPU) { + if (src->backend == GGML_BACKEND_TYPE_CPU) { kind = dpct::host_to_device; src_ptr = (char *) src->data; - // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr); - } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr); + } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = dpct::device_to_device; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -11846,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -12119,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; #ifdef GGML_SYCL_F16 bool use_fp16 = true; // TODO(Yu) SYCL capability check @@ -12501,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; // dd = data device float * src0_ddf = nullptr; @@ -12565,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(CHECK_TRY_ERROR( dpct::get_current_device().queues_wait_and_throw())); } @@ -12640,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -12656,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); @@ -12717,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; @@ -12782,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; const int64_t row_diff = row_high[id] - row_low[id]; ggml_sycl_set_device(get_device_id_by_index(id)); @@ -12809,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) { + if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) { dst_dd_i += row_low[id]; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { if (id != g_main_device_index) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset; @@ -12830,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, src1_ncols * ne10 * sizeof(float)))); } } - } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { SYCL_CHECK(ggml_sycl_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); /* DPCT1010:92: SYCL uses exceptions to report errors and does @@ -12867,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, if (!dst_on_device) { void * dst_off_device; dpct::memcpy_direction kind; - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { dst_off_device = dst->data; kind = dpct::device_to_host; - } else if (dst->backend == GGML_BACKEND_GPU) { + } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { dst_off_device = dst_extra->data_device[g_main_device_index]; kind = dpct::device_to_device; } else { @@ -12954,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(CHECK_TRY_ERROR( dpct::get_current_device().queues_wait_and_throw())); @@ -13091,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -13129,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0, GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13196,7 +13196,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13372,11 +13372,11 @@ catch (sycl::exception const &exc) { static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_GPU) && - ( dst->backend == GGML_BACKEND_GPU); + (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_TYPE_GPU) && + ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { @@ -13505,7 +13505,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne); @@ -13643,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - if (ids->backend == GGML_BACKEND_GPU) { + if (ids->backend == GGML_BACKEND_TYPE_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index]; SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); @@ -13661,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_GPU; - dst_row.backend = GGML_BACKEND_GPU; + src1_row.backend = GGML_BACKEND_TYPE_GPU; + dst_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_CPU ? + char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index]; - char * dst_original = dst->backend == GGML_BACKEND_CPU ? + char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; @@ -13756,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); } } @@ -13779,8 +13779,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -13887,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { memset(extra, 0, sizeof(*extra)); for (int64_t id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device_index) { + if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) { continue; } ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { + if (backend == GGML_BACKEND_TYPE_GPU) { row_low = 0; row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { + } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { const int64_t rounding = get_row_rounding(tensor->type); row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; @@ -13946,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { extra->data_device[id] = buf; - if (backend == GGML_BACKEND_GPU_SPLIT) { + if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { for (int64_t is = 0; is < MAX_STREAMS; ++is) { SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event())); @@ -13963,7 +13963,7 @@ catch (sycl::exception const &exc) { } void ggml_sycl_free_data(struct ggml_tensor *tensor) try { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) { return; } @@ -14016,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, return; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) { const ggml_op src0_op = tensor->src[0]->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); } } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) { ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); } @@ -14042,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, SYCL_CHECK(ggml_sycl_set_device(g_main_device)); const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; size_t offset = 0; @@ -14111,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor, const bool inplace = tensor->view_src != nullptr; - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; size_t view_offset = 0; @@ -14132,7 +14132,7 @@ catch (sycl::exception const &exc) { } void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_is_contiguous(tensor)); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -14219,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ if (!g_sycl_loaded) return false; ggml_sycl_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; @@ -14359,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ggml_sycl_set_peer_access(tensor->src[1]->ne[1]); } if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } func(tensor->src[0], tensor->src[1], tensor); @@ -14517,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { @@ -14548,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -14573,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -14809,7 +14809,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( (char *)tensor->data + offset, data, size))); @@ -14827,7 +14827,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( data, (const char *)tensor->data + offset, size))); @@ -14880,7 +14880,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph ggml_sycl_set_main_device(sycl_ctx->device); ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -14888,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) continue; - assert(node->backend == GGML_BACKEND_GPU); + assert(node->backend == GGML_BACKEND_TYPE_GPU); assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU); assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->src[j]->extra != nullptr); } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4e5eaff15110b..6caafb82279ae 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su // compute ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); @@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_vk_sync_buffers(subctx); @@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; #endif GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t y_ne = ne10 * ne11 * ne12; @@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_sync_buffers(subctx); @@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const uint64_t d_ne = ne01 * ne11 * ne12; @@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_sync_buffers(subctx); @@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx // TODO: support for transposed / permuted tensors GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; @@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } } - const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0; @@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c vk_buffer d_D = extra->buffer_gpu.lock(); // Workaround for tiny tensor inputs on ROPE - if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { + if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } @@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } - if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { + if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) { ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); - } else if(dst->backend == GGML_BACKEND_CPU) { + } else if(dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); @@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); } @@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { // If backend is CPU, data from src0 has to be copied off the device - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; vk_buffer d_D = extra_src0->buffer_gpu.lock(); ggml_vk_sync_buffers(subctx); @@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif - const bool any_on_device = node->backend == GGML_BACKEND_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU)); + const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU + || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU)); if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { return; @@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ - const bool any_on_device = node->backend == GGML_BACKEND_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU + || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { return; @@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (node->backend == GGML_BACKEND_CPU || last_node) { + if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) { ggml_vk_ctx_end(ctx->compute_ctx); ctx->compute_ctx->exit_tensor = node; ctx->compute_ctx = nullptr; @@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { return false; @@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } @@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; } @@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g #endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c #endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml int last_node = cgraph->n_nodes - 1; // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly - while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) { + while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) { last_node -= 1; } @@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml } ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -5410,7 +5410,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -5436,14 +5436,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso std::vector done; ggml_vk_print_graph_origin(tensor, done); - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { free(tensor_data); } } static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) { return; - GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU); if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { return; } @@ -5481,7 +5481,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -5518,10 +5518,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src0_buffer = malloc(src0_size); src0_clone->data = src0_buffer; - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { memcpy(src0_clone->data, src0->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src0->backend == GGML_BACKEND_GPU) { + } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; uint64_t offset = extra->offset; if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) { @@ -5561,10 +5561,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src1_buffer = malloc(src1_size); src1_clone->data = src1_buffer; - if (src1->backend == GGML_BACKEND_CPU) { + if (src1->backend == GGML_BACKEND_TYPE_CPU) { memcpy(src1_clone->data, src1->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src1->backend == GGML_BACKEND_GPU) { + } else if (src1->backend == GGML_BACKEND_TYPE_GPU) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; uint64_t offset = extra->offset; if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) { @@ -5723,7 +5723,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { @@ -5735,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -5868,7 +5868,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ comp_result = nullptr; comp_size = 0; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { free(tensor_data); } } diff --git a/ggml.c b/ggml.c index c09a3cad657f2..1d81553f47106 100644 --- a/ggml.c +++ b/ggml.c @@ -2721,7 +2721,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } } - struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here @@ -2729,7 +2729,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, + /*.backend =*/ GGML_BACKEND_TYPE_CPU, /*.buffer =*/ NULL, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -3302,7 +3302,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) { char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { return (struct ggml_tensor *)(mem_buffer + obj->offs); } @@ -3319,7 +3319,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { return (struct ggml_tensor *)(mem_buffer + obj->offs); } @@ -3335,7 +3335,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); if (strcmp(cur->name, name) == 0) { return cur; @@ -5879,7 +5879,7 @@ struct ggml_tensor * ggml_top_k( int k) { GGML_ASSERT(a->ne[0] >= k); - struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC); + struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC); result = ggml_view_4d(ctx, result, k, result->ne[1], result->ne[2], result->ne[3], @@ -6673,7 +6673,7 @@ static void ggml_compute_forward_dup_same_cont( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -6705,7 +6705,7 @@ static void ggml_compute_forward_dup_f16( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -6978,7 +6978,7 @@ static void ggml_compute_forward_dup_f32( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7231,7 +7231,7 @@ static void ggml_compute_forward_dup_bytes( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7411,7 +7411,7 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7419,7 +7419,7 @@ static void ggml_compute_forward_add_f32( const int nth = params->nth; #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { + if (src1->backend == GGML_BACKEND_TYPE_GPU) { // TODO: OpenCL kernel support full broadcast GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { @@ -7501,7 +7501,7 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7580,7 +7580,7 @@ static void ggml_compute_forward_add_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7636,7 +7636,7 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7774,7 +7774,7 @@ static void ggml_compute_forward_add1_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7828,7 +7828,7 @@ static void ggml_compute_forward_add1_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7880,7 +7880,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7932,7 +7932,7 @@ static void ggml_compute_forward_add1_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8062,7 +8062,7 @@ static void ggml_compute_forward_acc_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (params->ith != 0) { return; } @@ -8074,7 +8074,7 @@ static void ggml_compute_forward_acc_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8176,7 +8176,7 @@ static void ggml_compute_forward_sub_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8257,14 +8257,14 @@ static void ggml_compute_forward_mul_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; #if defined(GGML_USE_CLBLAST) - if (src1->backend == GGML_BACKEND_GPU) { + if (src1->backend == GGML_BACKEND_TYPE_GPU) { // TODO: OpenCL kernel support full broadcast GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { @@ -8365,7 +8365,7 @@ static void ggml_compute_forward_div_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8460,7 +8460,7 @@ static void ggml_compute_forward_sqr_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8506,7 +8506,7 @@ static void ggml_compute_forward_sqrt_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8552,7 +8552,7 @@ static void ggml_compute_forward_log_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8598,7 +8598,7 @@ static void ggml_compute_forward_sum_f32( assert(params->ith == 0); assert(ggml_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8633,7 +8633,7 @@ static void ggml_compute_forward_sum_f16( assert(params->ith == 0); assert(ggml_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8690,7 +8690,7 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8745,7 +8745,7 @@ static void ggml_compute_forward_mean_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8804,7 +8804,7 @@ static void ggml_compute_forward_argmax_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8855,7 +8855,7 @@ static void ggml_compute_forward_repeat_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8900,7 +8900,7 @@ static void ggml_compute_forward_repeat_f16( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8974,7 +8974,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(dst, src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9051,7 +9051,7 @@ static void ggml_compute_forward_concat_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9123,7 +9123,7 @@ static void ggml_compute_forward_abs_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9169,7 +9169,7 @@ static void ggml_compute_forward_sgn_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9215,7 +9215,7 @@ static void ggml_compute_forward_neg_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9261,7 +9261,7 @@ static void ggml_compute_forward_step_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9307,7 +9307,7 @@ static void ggml_compute_forward_tanh_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9353,7 +9353,7 @@ static void ggml_compute_forward_elu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9399,7 +9399,7 @@ static void ggml_compute_forward_relu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9446,7 +9446,7 @@ static void ggml_compute_forward_gelu_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9509,7 +9509,7 @@ static void ggml_compute_forward_gelu_quick_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9572,7 +9572,7 @@ static void ggml_compute_forward_silu_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9633,7 +9633,7 @@ static void ggml_compute_forward_leaky_relu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9686,7 +9686,7 @@ static void ggml_compute_forward_silu_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, grad)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9748,7 +9748,7 @@ static void ggml_compute_forward_hardswish_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9791,7 +9791,7 @@ static void ggml_compute_forward_hardsigmoid_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9837,7 +9837,7 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9912,7 +9912,7 @@ static void ggml_compute_forward_rms_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9983,7 +9983,7 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10161,7 +10161,7 @@ static void ggml_compute_forward_group_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10328,7 +10328,7 @@ static void ggml_compute_forward_mul_mat( #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) { ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; @@ -10341,7 +10341,7 @@ static void ggml_compute_forward_mul_mat( const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); UNUSED(desired_wsize); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (type != GGML_TYPE_F32) { assert(params->wsize >= desired_wsize); // parallelize by src0 rows @@ -10364,7 +10364,7 @@ static void ggml_compute_forward_mul_mat( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10402,7 +10402,7 @@ static void ggml_compute_forward_mul_mat( } #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10426,7 +10426,7 @@ static void ggml_compute_forward_mul_mat( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10583,7 +10583,7 @@ static void ggml_compute_forward_mul_mat_id( #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10620,7 +10620,7 @@ static void ggml_compute_forward_mul_mat_id( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10768,7 +10768,7 @@ static void ggml_compute_forward_out_prod_f32( (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst if (use_blas) { return; @@ -10781,7 +10781,7 @@ static void ggml_compute_forward_out_prod_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10961,7 +10961,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10969,7 +10969,7 @@ static void ggml_compute_forward_out_prod_q_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11087,7 +11087,7 @@ static void ggml_compute_forward_scale_f32( GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11159,7 +11159,7 @@ static void ggml_compute_forward_set_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (params->ith != 0) { return; } @@ -11171,7 +11171,7 @@ static void ggml_compute_forward_set_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11319,7 +11319,7 @@ static void ggml_compute_forward_get_rows_q( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11359,7 +11359,7 @@ static void ggml_compute_forward_get_rows_f16( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11396,7 +11396,7 @@ static void ggml_compute_forward_get_rows_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11499,14 +11499,14 @@ static void ggml_compute_forward_get_rows_back_f32_f16( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11538,14 +11538,14 @@ static void ggml_compute_forward_get_rows_back_f32( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11615,7 +11615,7 @@ static void ggml_compute_forward_diag_f32( GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11684,7 +11684,7 @@ static void ggml_compute_forward_diag_mask_f32( GGML_ASSERT(n_past >= 0); - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (ith != 0) { return; } @@ -11698,7 +11698,7 @@ static void ggml_compute_forward_diag_mask_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11772,7 +11772,7 @@ static void ggml_compute_forward_soft_max_f32( assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11910,7 +11910,7 @@ static void ggml_compute_forward_soft_max_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src1, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12004,7 +12004,7 @@ static void ggml_compute_forward_alibi_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12063,7 +12063,7 @@ static void ggml_compute_forward_alibi_f16( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12170,7 +12170,7 @@ static void ggml_compute_forward_clamp_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12310,7 +12310,7 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12488,7 +12488,7 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12719,7 +12719,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -12759,7 +12759,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12818,7 +12818,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -12858,7 +12858,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12962,11 +12962,11 @@ static void ggml_compute_forward_im2col_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13050,11 +13050,11 @@ static void ggml_compute_forward_im2col_f16( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13136,7 +13136,7 @@ static void ggml_compute_forward_conv_transpose_2d( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -13178,7 +13178,7 @@ static void ggml_compute_forward_conv_transpose_2d( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13230,7 +13230,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13299,7 +13299,7 @@ static void ggml_compute_forward_pool_2d( GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13372,7 +13372,7 @@ static void ggml_compute_forward_upscale_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13432,7 +13432,7 @@ static void ggml_compute_forward_pad_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13493,7 +13493,7 @@ static void ggml_compute_forward_argsort_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13519,8 +13519,8 @@ static void ggml_compute_forward_argsort_f32( // C doesn't have a functional sort, so we do a bubble sort instead for (int64_t j = 0; j < ne0; j++) { for (int64_t k = j + 1; k < ne0; k++) { - if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || - (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { int32_t tmp = dst_data[j]; dst_data[j] = dst_data[k]; dst_data[k] = tmp; @@ -13603,11 +13603,11 @@ static void ggml_compute_forward_flash_attn_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13795,11 +13795,11 @@ static void ggml_compute_forward_flash_attn_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14054,11 +14054,11 @@ static void ggml_compute_forward_flash_ff_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14213,14 +14213,14 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith == 0) { memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); } return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14536,7 +14536,7 @@ static void ggml_compute_forward_win_part_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14602,7 +14602,7 @@ static void ggml_compute_forward_win_unpart_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14730,7 +14730,7 @@ static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14782,14 +14782,14 @@ static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_tensor * src2 = dst->src[2]; const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; - if (!inplace && params->type == GGML_TASK_INIT) { + if (!inplace && params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14871,7 +14871,7 @@ static void ggml_compute_forward_map_unary_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14920,7 +14920,7 @@ static void ggml_compute_forward_map_binary_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14969,7 +14969,7 @@ static void ggml_compute_forward_map_custom1_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14988,7 +14988,7 @@ static void ggml_compute_forward_map_custom2_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15008,7 +15008,7 @@ static void ggml_compute_forward_map_custom3_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15023,7 +15023,7 @@ static void ggml_compute_forward_map_custom1( const struct ggml_tensor * a = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15041,7 +15041,7 @@ static void ggml_compute_forward_map_custom2( const struct ggml_tensor * a = dst->src[0]; const struct ggml_tensor * b = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15060,7 +15060,7 @@ static void ggml_compute_forward_map_custom3( const struct ggml_tensor * b = dst->src[1]; const struct ggml_tensor * c = dst->src[2]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15094,14 +15094,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); } return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { if (ith == 0) { float * dp = (float *) dst->data; ggml_vec_sum_f32(nth, dp, sums); @@ -15216,7 +15216,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ith = params->ith; const int64_t nth = params->nth; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15323,8 +15323,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #elif defined(GGML_USE_VULKAN) const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS @@ -15335,8 +15335,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #endif // GGML_USE_CUBLAS #ifdef GGML_USE_SYCL @@ -16882,7 +16882,7 @@ size_t ggml_graph_overhead(void) { struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) { const size_t obj_size = ggml_graph_nbytes(size, grads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1); @@ -17429,7 +17429,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { set_numa_thread_affinity(state->ith); int node_n = -1; - int task_phase = GGML_TASK_FINALIZE; + int task_phase = GGML_TASK_TYPE_FINALIZE; while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { @@ -17441,7 +17441,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, + /*.type =*/ GGML_TASK_TYPE_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, /*.wsize =*/ cplan->work_size, @@ -17472,17 +17472,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (n_tasks == 1) { /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; + params.type = GGML_TASK_TYPE_INIT; ggml_compute_forward(¶ms, node); } // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node); if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_FINALIZE; + params.type = GGML_TASK_TYPE_FINALIZE; ggml_compute_forward(¶ms, node); } @@ -17496,7 +17496,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - task_phase = GGML_TASK_INIT; + task_phase = GGML_TASK_TYPE_INIT; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_n, node_n); atomic_store(&state->shared->node_task, task_phase); @@ -17513,7 +17513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const int n_tasks = ggml_get_n_tasks(node, n_threads); struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, + /*.type =*/ GGML_TASK_TYPE_INIT, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, /*.wsize =*/ cplan->work_size, @@ -17527,7 +17527,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_COMPUTE; + task_phase = GGML_TASK_TYPE_COMPUTE; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_task, task_phase); } @@ -17542,12 +17542,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (state->ith < n_tasks) { - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node); } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_FINALIZE; + task_phase = GGML_TASK_TYPE_FINALIZE; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_task, task_phase); } @@ -17783,7 +17783,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, - /*.node_task =*/ GGML_TASK_FINALIZE, + /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; @@ -17851,7 +17851,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -18659,7 +18659,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; bool cancel = false; @@ -18671,7 +18671,7 @@ static enum ggml_opt_result ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -18762,7 +18762,7 @@ static enum ggml_opt_result ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL;; + return GGML_OPT_RESULT_CANCEL;; } } // ggml_graph_reset (gf); @@ -18779,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_adam( if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } // delta-based convergence test @@ -18789,7 +18789,7 @@ static enum ggml_opt_result ggml_opt_adam( const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } @@ -18805,7 +18805,7 @@ static enum ggml_opt_result ggml_opt_adam( ++n_no_improvement[0]; if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } } @@ -18823,7 +18823,7 @@ static enum ggml_opt_result ggml_opt_adam( } } - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } // @@ -18904,7 +18904,7 @@ static enum ggml_opt_result linesearch_backtracking( float sched = 0; callback(callback_data, accum_step, &sched, cancel); if (*cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -18977,7 +18977,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return GGML_OPT_INVALID_WOLFE; + return GGML_OPT_RESULT_INVALID_WOLFE; } } @@ -19006,7 +19006,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; float * x = opt->lbfgs.x->data; // current parameters @@ -19047,7 +19047,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( float sched = 0; callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -19075,7 +19075,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // already optimized if (gnorm/xnorm <= params.lbfgs.eps) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } if (opt->just_initialized) { @@ -19120,7 +19120,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // way to test and don't want to break something with so many changes lined up ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } if (ls < 0) { @@ -19143,7 +19143,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } if (gnorm/xnorm <= params.lbfgs.eps) { // converged - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } // delta-based convergence test @@ -19153,7 +19153,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } @@ -19169,14 +19169,14 @@ static enum ggml_opt_result ggml_opt_lbfgs( n_no_improvement[0]++; if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } } if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } // update vectors s and y: @@ -19232,17 +19232,17 @@ static enum ggml_opt_result ggml_opt_lbfgs( GGML_ASSERT(false && "lbfgs failed"); - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { struct ggml_opt_params result; switch (type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { result = (struct ggml_opt_params) { - .type = GGML_OPT_ADAM, + .type = GGML_OPT_TYPE_ADAM, .graph_size = GGML_DEFAULT_GRAPH_SIZE, .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ? .past = 0, @@ -19270,10 +19270,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { }, }; } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, + .type = GGML_OPT_TYPE_LBFGS, .graph_size = GGML_DEFAULT_GRAPH_SIZE, .n_threads = 1, .past = 0, @@ -19318,12 +19318,12 @@ GGML_API void ggml_opt_init( opt->just_initialized = true; if (opt->ctx == NULL) { struct ggml_init_params ctx_opt_params; - if (opt->params.type == GGML_OPT_ADAM) { + if (opt->params.type == GGML_OPT_TYPE_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; } - } else if (opt->params.type == GGML_OPT_LBFGS) { + } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; @@ -19335,7 +19335,7 @@ GGML_API void ggml_opt_init( opt->ctx = ggml_init(ctx_opt_params); } switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); @@ -19349,7 +19349,7 @@ GGML_API void ggml_opt_init( ggml_set_zero(opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); @@ -19393,13 +19393,13 @@ enum ggml_opt_result ggml_opt( ctx = ggml_init(params_ctx); if (ctx == NULL) { - return GGML_OPT_NO_CONTEXT; + return GGML_OPT_RESULT_NO_CONTEXT; } free_ctx = true; } - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_RESULT_OK; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); @@ -19438,14 +19438,14 @@ enum ggml_opt_result ggml_opt_resume_g( void * callback_data) { // build forward + backward compute graphs - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_RESULT_OK; switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; diff --git a/ggml.h b/ggml.h index a4166e1f7afd0..75fd035a4698f 100644 --- a/ggml.h +++ b/ggml.h @@ -364,9 +364,9 @@ extern "C" { }; enum ggml_backend_type { - GGML_BACKEND_CPU = 0, - GGML_BACKEND_GPU = 10, - GGML_BACKEND_GPU_SPLIT = 20, + GGML_BACKEND_TYPE_CPU = 0, + GGML_BACKEND_TYPE_GPU = 10, + GGML_BACKEND_TYPE_GPU_SPLIT = 20, }; // model file types @@ -498,9 +498,9 @@ extern "C" { }; enum ggml_object_type { - GGML_OBJECT_TENSOR, - GGML_OBJECT_GRAPH, - GGML_OBJECT_WORK_BUFFER + GGML_OBJECT_TYPE_TENSOR, + GGML_OBJECT_TYPE_GRAPH, + GGML_OBJECT_TYPE_WORK_BUFFER }; enum ggml_log_level { @@ -642,9 +642,9 @@ extern "C" { // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. enum ggml_task_type { - GGML_TASK_INIT = 0, - GGML_TASK_COMPUTE, - GGML_TASK_FINALIZE, + GGML_TASK_TYPE_INIT = 0, + GGML_TASK_TYPE_COMPUTE, + GGML_TASK_TYPE_FINALIZE, }; struct ggml_compute_params { @@ -1649,8 +1649,8 @@ extern "C" { // sort rows enum ggml_sort_order { - GGML_SORT_ASC, - GGML_SORT_DESC, + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, }; GGML_API struct ggml_tensor * ggml_argsort( @@ -1943,8 +1943,8 @@ extern "C" { // optimization methods enum ggml_opt_type { - GGML_OPT_ADAM, - GGML_OPT_LBFGS, + GGML_OPT_TYPE_ADAM, + GGML_OPT_TYPE_LBFGS, }; // linesearch methods @@ -1958,12 +1958,12 @@ extern "C" { // optimization return values enum ggml_opt_result { - GGML_OPT_OK = 0, - GGML_OPT_DID_NOT_CONVERGE, - GGML_OPT_NO_CONTEXT, - GGML_OPT_INVALID_WOLFE, - GGML_OPT_FAIL, - GGML_OPT_CANCEL, + GGML_OPT_RESULT_OK = 0, + GGML_OPT_RESULT_DID_NOT_CONVERGE, + GGML_OPT_RESULT_NO_CONTEXT, + GGML_OPT_RESULT_INVALID_WOLFE, + GGML_OPT_RESULT_FAIL, + GGML_OPT_RESULT_CANCEL, GGML_LINESEARCH_FAIL = -128, GGML_LINESEARCH_MINIMUM_STEP, diff --git a/llama.cpp b/llama.cpp index 1f6b6cff48987..acd9be08a6e5e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_NONE, "none" }, - { LLAMA_ROPE_SCALING_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_UNSPECIFIED; + return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1580,7 +1580,7 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_NONE; + uint32_t pooling_type = LLAMA_POOLING_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -2345,9 +2345,9 @@ namespace GGUFMeta { static const char * override_type_to_str(const llama_model_kv_override_type ty) { switch (ty) { - case LLAMA_KV_OVERRIDE_BOOL: return "bool"; - case LLAMA_KV_OVERRIDE_INT: return "int"; - case LLAMA_KV_OVERRIDE_FLOAT: return "float"; + case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; + case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; + case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; } return "unknown"; } @@ -2358,13 +2358,13 @@ namespace GGUFMeta { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", __func__, override_type_to_str(override->tag), override->key); switch (override->tag) { - case LLAMA_KV_OVERRIDE_BOOL: { + case LLAMA_KV_OVERRIDE_TYPE_BOOL: { LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); } break; - case LLAMA_KV_OVERRIDE_INT: { + case LLAMA_KV_OVERRIDE_TYPE_INT: { LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); } break; - case LLAMA_KV_OVERRIDE_FLOAT: { + case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { LLAMA_LOG_INFO("%.6f\n", override->float_value); } break; default: @@ -2383,7 +2383,7 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, override)) { target = override->bool_value; return true; } @@ -2393,7 +2393,7 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, override)) { target = override->int_value; return true; } @@ -2403,7 +2403,7 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, override)) { target = override->float_value; return true; } @@ -2999,7 +2999,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3643,7 +3643,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_LAYER) { + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3682,10 +3682,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_ROW) { + if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -5070,7 +5070,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), + pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_TYPE_NONE), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -6050,12 +6050,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_MEAN) { + if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_CLS) { + } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7754,7 +7754,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7782,7 +7782,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -11351,7 +11351,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11377,7 +11377,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11565,16 +11565,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11608,8 +11608,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11618,7 +11618,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { diff --git a/llama.h b/llama.h index 889edf4d97b96..947284ea2f535 100644 --- a/llama.h +++ b/llama.h @@ -109,23 +109,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_NONE = 0, - LLAMA_ROPE_SCALING_LINEAR = 1, - LLAMA_ROPE_SCALING_YARN = 2, - LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_TYPE_NONE = 0, + LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, + LLAMA_ROPE_SCALING_TYPE_YARN = 2, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_NONE = 0, - LLAMA_POOLING_MEAN = 1, - LLAMA_POOLING_CLS = 2, + LLAMA_POOLING_TYPE_NONE = 0, + LLAMA_POOLING_TYPE_MEAN = 1, + LLAMA_POOLING_TYPE_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_NONE = 0, // single GPU - LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { @@ -173,9 +173,9 @@ extern "C" { } llama_batch; enum llama_model_kv_override_type { - LLAMA_KV_OVERRIDE_INT, - LLAMA_KV_OVERRIDE_FLOAT, - LLAMA_KV_OVERRIDE_BOOL, + LLAMA_KV_OVERRIDE_TYPE_INT, + LLAMA_KV_OVERRIDE_TYPE_FLOAT, + LLAMA_KV_OVERRIDE_TYPE_BOOL, }; struct llama_model_kv_override { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f8574588bcf2f..24d12ef141efd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1264,7 +1264,7 @@ struct test_argsort : public test_case { test_argsort(ggml_type type = GGML_TYPE_F32, std::array ne = {16, 10, 10, 10}, - ggml_sort_order order = GGML_SORT_ASC) + ggml_sort_order order = GGML_SORT_ORDER_ASC) : type(type), ne(ne), order(order) {} ggml_tensor * build_graph(ggml_context * ctx) override { @@ -2116,7 +2116,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_concat(GGML_TYPE_F32)); test_cases.emplace_back(new test_concat(GGML_TYPE_I32)); - for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) { + for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); } diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 2c9997fca7705..546ca230ba417 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -118,7 +118,7 @@ int main(void) { const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); - struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); ggml_opt(ctx, opt_params, e); From 12894088170f62e4cad4f8d6a3043c185b414bab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sun, 25 Feb 2024 11:53:11 +0100 Subject: [PATCH 41/51] cmake : fix compilation for Android armeabi-v7a (#5702) --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c4629001a38d..48880f7204bf5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a # Raspberry Pi 3, 4, Zero 2 (32-bit) list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() From d52d7819b8ced70c642a88a59da8c78208dc58ec Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 25 Feb 2024 13:49:43 +0100 Subject: [PATCH 42/51] server: concurrency fix + monitoring - add /metrics prometheus compatible endpoint (#5708) * server: monitoring - add /metrics prometheus compatible endpoint * server: concurrency issue, when 2 task are waiting for results, only one call thread is notified * server: metrics - move to a dedicated struct --- examples/server/README.md | 13 ++ examples/server/server.cpp | 150 +++++++++++++++++- examples/server/tests/features/environment.py | 2 + examples/server/tests/features/server.feature | 2 + examples/server/tests/features/steps/steps.py | 27 ++++ examples/server/tests/requirements.txt | 1 + examples/server/utils.hpp | 4 +- 7 files changed, 191 insertions(+), 8 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 0c43ac4c97cba..2129f7fb2b463 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` - `-n, --n-predict`: Set the maximum tokens to predict (default: -1) - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. +- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) ## Build @@ -457,6 +458,18 @@ Notice that each `probs` is an array of length `n_probs`. ] ``` +- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: + +Available metrics: +- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. +- `llamacpp:tokens_predicted_total`: Number of generation tokens processed. +- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s. +- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s. +- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage. +- `llamacpp:kv_cache_tokens`: KV-cache tokens. +- `llamacpp:requests_processing`: Number of request processing. +- `llamacpp:requests_deferred`: Number of request deferred. + ## More examples ### Change system prompt on runtime diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 780862ef67810..811495915f6dd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,6 +43,7 @@ struct server_params int32_t read_timeout = 600; int32_t write_timeout = 600; bool slots_endpoint = true; + bool metrics_endpoint = false; }; bool server_verbose = false; @@ -310,6 +311,39 @@ struct llama_client_slot } }; +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + struct llama_server_context { llama_model *model = nullptr; @@ -344,6 +378,8 @@ struct llama_server_context llama_server_queue queue_tasks; llama_server_response queue_results; + llama_metrics metrics; + ~llama_server_context() { if (ctx) @@ -1404,7 +1440,7 @@ struct llama_server_context case TASK_TYPE_NEXT_RESPONSE: { // do nothing } break; - case TASK_TYPE_SLOTS_DATA: { + case TASK_TYPE_METRICS: { json slots_data = json::array(); int n_idle_slots = 0; int n_processing_slots = 0; @@ -1438,10 +1474,24 @@ struct llama_server_context res.stop = true; res.error = false; res.result_json = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "slots", slots_data } + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", queue_tasks.queue_tasks_deferred.size() }, + + { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, + { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, + + { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, + { "t_prompt_processing", metrics.t_prompt_processing}, + { "n_tokens_predicted", metrics.n_tokens_predicted}, + { "t_tokens_generation", metrics.t_tokens_generation}, + + { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, + { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, + + { "slots", slots_data }, }; + metrics.reset_bucket(); queue_results.send(res); } break; } @@ -1849,6 +1899,7 @@ struct llama_server_context { slot.t_start_genereration = ggml_time_us(); slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); } llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; @@ -1871,6 +1922,7 @@ struct llama_server_context slot.release(); slot.print_timings(); send_final_response(slot); + metrics.on_prediction(slot); } slot.i_batch = -1; @@ -1955,6 +2007,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); + printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); printf("\n"); printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); @@ -2414,6 +2467,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, { sparams.slots_endpoint = false; } + else if (arg == "--metrics") + { + sparams.metrics_endpoint = true; + } else if (arg == "--chat-template") { if (++i >= argc) @@ -2621,7 +2678,7 @@ int main(int argc, char **argv) // request slots data using task queue task_server task; task.id = llama.queue_tasks.get_new_id(); - task.type = TASK_TYPE_SLOTS_DATA; + task.type = TASK_TYPE_METRICS; task.target_id = -1; llama.queue_results.add_waiting_task_id(task.id); @@ -2668,7 +2725,7 @@ int main(int argc, char **argv) // request slots data using task queue task_server task; task.id = llama.queue_tasks.get_new_id(); - task.type = TASK_TYPE_SLOTS_DATA; + task.type = TASK_TYPE_METRICS; task.target_id = -1; llama.queue_results.add_waiting_task_id(task.id); @@ -2683,6 +2740,87 @@ int main(int argc, char **argv) }); } + if (sparams.metrics_endpoint) { + svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) { + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_METRICS; + task.target_id = -1; + + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + json data = result.result_json; + + uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"]; + uint64_t t_prompt_processing = data["t_prompt_processing"]; + + uint64_t n_tokens_predicted = data["n_tokens_predicted"]; + uint64_t t_tokens_generation = data["t_tokens_generation"]; + + int32_t kv_cache_used_cells = data["kv_cache_used_cells"]; + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", data["n_prompt_tokens_processed_total"]} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", data["n_tokens_predicted_total"]} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0} + },{ + {"name", "kv_cache_usage_ratio"}, + {"help", "KV-cache usage. 1 means 100 percent usage."}, + {"value", 1. * kv_cache_used_cells / params.n_ctx} + },{ + {"name", "kv_cache_tokens"}, + {"help", "KV-cache tokens."}, + {"value", data["kv_cache_tokens_count"]} + },{ + {"name", "requests_processing"}, + {"help", "Number of request processing."}, + {"value", data["processing"]} + },{ + {"name", "requests_deferred"}, + {"help", "Number of request deferred."}, + {"value", data["deferred"]} + }}} + }; + + std::stringstream prometheus; + for (const auto& el : all_metrics_def.items()) { + const auto& type = el.key(); + const auto& metrics_def = el.value(); + for (const auto& metric_def : metrics_def) { + std::string name = metric_def["name"]; + std::string help = metric_def["help"]; + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << metric_def["value"] << "\n"; + } + } + + res.set_content(prometheus.str(), "text/plain; version=0.0.4"); + res.status = 200; // HTTP OK + }); + } + svr.set_logger(log_server_request); svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 13cc841017f62..09e8267476135 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -16,6 +16,8 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): + if context.server_process is None: + return if scenario.status == "failed": if 'GITHUB_ACTIONS' in os.environ: print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 5f81d256a548c..0139f89d831a2 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -13,6 +13,7 @@ Feature: llama.cpp server And 1 slots And embeddings extraction And 32 server max tokens to predict + And prometheus compatible metrics exposed Then the server is starting Then the server is healthy @@ -25,6 +26,7 @@ Feature: llama.cpp server And max tokens to predict And a completion request with no api error Then tokens are predicted matching + And prometheus metrics are exposed Examples: Prompts | prompt | n_predict | re_content | n_predicted | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 9c825fdbcd7f5..051fd440c3391 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -13,6 +13,7 @@ import openai from behave import step from behave.api.async_step import async_run_until_complete +from prometheus_client import parser @step(u"a server listening on {server_fqdn}:{server_port}") @@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port): context.server_api_key = None context.server_continuous_batching = False context.server_embeddings = False + context.server_metrics = False + context.server_process = None context.server_seed = None context.user_api_key = None @@ -82,6 +85,11 @@ def step_server_embeddings(context): context.server_embeddings = True +@step(u'prometheus compatible metrics exposed') +def step_server_metrics(context): + context.server_metrics = True + + @step(u"the server is starting") def step_start_server(context): start_server_background(context) @@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value): assert context.options_response.headers[cors_header] == cors_header_value +@step(u'prometheus metrics are exposed') +@async_run_until_complete +async def step_prometheus_metrics_exported(context): + async with aiohttp.ClientSession() as session: + async with await session.get(f'{context.base_url}/metrics') as metrics_response: + assert metrics_response.status == 200 + assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" + metrics_raw = await metrics_response.text() + metric_exported = False + for metric in parser.text_string_to_metric_families(metrics_raw): + match metric.name: + case "llamacpp:kv_cache_usage_ratio": + assert len(metric.samples) > 0 + metric_exported = True + assert metric_exported, "No metrics exported" + + async def concurrent_requests(context, f_completion, *args, **kwargs): n_prompts = len(context.prompts) if context.debug: @@ -753,6 +778,8 @@ def start_server_background(context): server_args.append('--cont-batching') if context.server_embeddings: server_args.append('--embedding') + if context.server_metrics: + server_args.append('--metrics') if context.model_alias is not None: server_args.extend(['--alias', context.model_alias]) if context.n_ctx is not None: diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 3e51b12dc8207..334fa4a70ea72 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,3 +1,4 @@ aiohttp~=3.9.3 behave~=1.2.6 openai~=0.25.0 +prometheus-client~=0.20.0 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 88545eb6931d0..71cc5b0b8b6de 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -50,7 +50,7 @@ enum task_type { TASK_TYPE_COMPLETION, TASK_TYPE_CANCEL, TASK_TYPE_NEXT_RESPONSE, - TASK_TYPE_SLOTS_DATA + TASK_TYPE_METRICS }; struct task_server { @@ -441,7 +441,7 @@ struct llama_server_response { { LOG_VERBOSE("queue_results.push_back", {}); queue_results.push_back(result); - condition_results.notify_one(); + condition_results.notify_all(); return; } } From 930b1780269a69948d106e2d1b838ab7661f679a Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 25 Feb 2024 13:50:32 +0100 Subject: [PATCH 43/51] server: logs - unified format and --log-format option (#5700) * server: logs - always use JSON logger, add add thread_id in message, log task_id and slot_id * server : skip GH copilot requests from logging * server : change message format of server_log() * server : no need to repeat log in comment * server : log style consistency * server : fix compile warning * server : fix tests regex patterns on M2 Ultra * server: logs: PR feedback on log level * server: logs: allow to choose log format in json or plain text * server: tests: output server logs in text * server: logs switch init logs to server logs macro * server: logs ensure value json value does not raised error * server: logs reduce level VERBOSE to VERB to max 4 chars * server: logs lower case as other log messages * server: logs avoid static in general Co-authored-by: Georgi Gerganov * server: logs PR feedback: change text log format to: LEVEL [function_name] message | additional=data --------- Co-authored-by: Georgi Gerganov --- examples/server/README.md | 4 +- examples/server/server.cpp | 218 ++++++++++++++---- examples/server/tests/README.md | 1 + examples/server/tests/features/server.feature | 6 +- examples/server/tests/features/steps/steps.py | 2 + examples/server/utils.hpp | 80 ++++--- 6 files changed, 231 insertions(+), 80 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 2129f7fb2b463..cb3fd6054095b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -39,10 +39,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` -- `-n, --n-predict`: Set the maximum tokens to predict (default: -1) +- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1) - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. - `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +- `--log-disable`: Output logs to stdout only, default: enabled. +- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) ## Build diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 811495915f6dd..d970202d2b5d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -47,6 +47,7 @@ struct server_params }; bool server_verbose = false; +bool server_log_json = true; static size_t common_part(const std::vector &a, const std::vector &b) { @@ -302,12 +303,43 @@ struct llama_client_slot } void print_timings() const { - LOG_TEE("\n"); - LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); - LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); - LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); } }; @@ -399,7 +431,7 @@ struct llama_server_context params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_TEE("Multi Modal Mode Enabled"); + LOG_INFO("Multi Modal Mode Enabled", {}); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); if(clp_ctx == nullptr) { LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); @@ -452,7 +484,7 @@ struct llama_server_context const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_TEE("Available slots:\n"); + LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; @@ -461,7 +493,10 @@ struct llama_server_context slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_INFO("new slot", { + {"slot_id", slot.id}, + {"n_ctx_slot", slot.n_ctx} + }); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -471,7 +506,12 @@ struct llama_server_context GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); + + LOG_INFO("slot self-extend", { + {"slot_id", slot.id}, + {"ga_n", ga_n}, + {"ga_w", ga_w} + }); } slot.ga_i = 0; @@ -765,10 +805,16 @@ struct llama_server_context img_sl.img_data = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); + LOG_ERROR("failed to load image", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); return false; } - LOG_TEE("slot %i - loaded image\n", slot->id); + LOG_VERBOSE("image loaded", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -828,7 +874,10 @@ struct llama_server_context all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_INFO("slot is processing task", { + {"slot_id", slot->id}, + {"task_id", slot->task_id}, + }); return true; } @@ -1391,7 +1440,7 @@ struct llama_server_context if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {}); + LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); queue_tasks.defer(task); break; } @@ -1467,7 +1516,17 @@ struct llama_server_context } slots_data.push_back(slot_data); } - LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); + LOG_INFO("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots} + }); + LOG_VERBOSE("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots}, + {"slots", slots_data} + }); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; @@ -1519,7 +1578,7 @@ struct llama_server_context bool update_slots() { if (system_need_update) { - LOG_TEE("updating system prompt\n"); + LOG_INFO("updating system prompt", {}); update_system_prompt(); } @@ -1529,12 +1588,13 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); kv_cache_clear(); } return true; } + LOG_VERBOSE("posting NEXT_RESPONSE", {}); task_server task; task.type = TASK_TYPE_NEXT_RESPONSE; task.target_id = -1; @@ -1548,10 +1608,20 @@ struct llama_server_context { // Shift context const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = system_tokens.size() + slot.n_past - n_keep; + const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); + LOG_INFO("slot context shift", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_keep", n_keep}, + {"n_left", n_left}, + {"n_discard", n_discard}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()} + }); llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1565,17 +1635,12 @@ struct llama_server_context slot.n_past -= n_discard; slot.truncated = true; - - LOG_VERBOSE("context shift", { - { "n_ctx", n_ctx }, - { "n_keep", n_keep }, - { "n_left", n_left }, - }); } } } // decode any currently ongoing sequences + LOG_VERBOSE("decoding ongoing sequences", {}); for (auto & slot : slots) { // release the slot @@ -1585,7 +1650,15 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG_INFO("slot released", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); queue_tasks.notify_slot_changed(); continue; @@ -1733,7 +1806,12 @@ struct llama_server_context slot.ga_i = ga_i; } - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG_INFO("slot progression", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "n_past", slot.n_past }, + { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + }); } slot.cache_tokens = prompt_tokens; @@ -1741,7 +1819,10 @@ struct llama_server_context if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + LOG_INFO("we have to evaluate at least 1 token to generate logits", { + { "slot_id", slot.id }, + { "task_id", slot.task_id } + }); slot.n_past--; if (slot.ga_i > 0) { @@ -1749,9 +1830,13 @@ struct llama_server_context } } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - - llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + int p0 = (int) system_tokens.size() + slot.n_past; + LOG_INFO("kv cache rm [p0, end)", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "p0", p0 } + }); + llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); LOG_VERBOSE("prompt ingested", { {"n_past", slot.n_past}, @@ -1786,7 +1871,13 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG_ERROR("failed processing images", { + "slot_id", slot.id, + "task_id", slot.task_id, + }); + // FIXME @phymbert: to be properly tested + // early returning without changing the slot state will block the slot for ever + // no one at the moment is checking the return value return false; } @@ -1928,6 +2019,8 @@ struct llama_server_context slot.i_batch = -1; } } + + LOG_VERBOSE("slots updated", {}); return true; } @@ -2005,6 +2098,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -ctv TYPE, --cache-type-v TYPE\n"); printf(" KV cache data type for V (default: f16)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); + printf(" --log-format log output format: json or text (default: json)\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); @@ -2458,6 +2552,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.mmproj = argv[i]; } + else if (arg == "--log-format") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + if (std::strcmp(argv[i], "json") == 0) + { + server_log_json = true; + } + else if (std::strcmp(argv[i], "text") == 0) + { + server_log_json = false; + } + else + { + invalid_param = true; + break; + } + } else if (arg == "--log-disable") { log_set_target(stdout); @@ -2571,32 +2686,40 @@ static json format_partial_response( static json format_tokenizer_response(const std::vector &tokens) { - return json{ - {"tokens", tokens}}; + return json { + {"tokens", tokens} + }; } static json format_detokenized_response(std::string content) { - return json{ - {"content", content}}; + return json { + {"content", content} + }; } static void log_server_request(const httplib::Request &req, const httplib::Response &res) { + // skip GH copilot requests when using default port + if (req.path == "/v1/health" || req.path == "/v1/completions") + { + return; + } + LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + {"remote_addr", req.remote_addr}, + {"remote_port", req.remote_port}, + {"status", res.status}, + {"method", req.method}, + {"path", req.path}, + {"params", req.params}, + }); LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + {"request", req.body}, + {"response", res.body}, + }); } struct token_translator @@ -2873,9 +2996,6 @@ int main(int argc, char **argv) // Set the base directory for serving static files svr.set_base_dir(sparams.public_path); - // to make it ctrl+clickable: - LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); - std::unordered_map log_data; log_data["hostname"] = sparams.hostname; log_data["port"] = std::to_string(sparams.port); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index e44c5c286601f..0b9fdc4e72678 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables: - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` + - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format ### Run @bug, @wip or @wrong_usage annotated scenario diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 0139f89d831a2..b571582a7857e 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -29,9 +29,9 @@ Feature: llama.cpp server And prometheus metrics are exposed Examples: Prompts - | prompt | n_predict | re_content | n_predicted | - | I believe the meaning of life is | 8 | read | 8 | - | Write a joke about AI | 64 | (parkfriendsscared)+ | 32 | + | prompt | n_predict | re_content | n_predicted | + | I believe the meaning of life is | 8 | (readgoing)+ | 8 | + | Write a joke about AI | 64 | (parkfriendsscaredalways)+ | 32 | Scenario Outline: OAI Compatibility Given a model diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 051fd440c3391..8e4babf204f8a 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -792,6 +792,8 @@ def start_server_background(context): server_args.extend(['--api-key', context.server_api_key]) if context.debug: server_args.append('--verbose') + if 'SERVER_LOG_FORMAT_JSON' not in os.environ: + server_args.extend(['--log-format', "text"]) print(f"starting server with: {context.server_path}", *server_args) context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 71cc5b0b8b6de..d7abd7cbba71c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -14,6 +14,7 @@ using json = nlohmann::json; extern bool server_verbose; +extern bool server_log_json; #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -27,14 +28,14 @@ extern bool server_verbose; { \ if (server_verbose) \ { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ } \ } while (0) #endif -#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) // // parallel @@ -133,26 +134,48 @@ struct completion_token_output std::string text_to_send; }; -static inline void server_log(const char *level, const char *function, int line, - const char *message, const nlohmann::ordered_json &extra) +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { - nlohmann::ordered_json log - { + std::stringstream ss_tid; + ss_tid << std::this_thread::get_id(); + json log = nlohmann::ordered_json{ + {"tid", ss_tid.str()}, {"timestamp", time(nullptr)}, - {"level", level}, - {"function", function}, - {"line", line}, - {"message", message}, }; - if (!extra.empty()) - { - log.merge_patch(extra); - } + if (server_log_json) { + log.merge_patch( + { + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); + if (!extra.empty()) { + log.merge_patch(extra); + } + + std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; + } else { + char buf[1024]; + snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); + if (!extra.empty()) { + log.merge_patch(extra); + } + std::stringstream ss; + ss << buf << " |"; + for (const auto& el : log.items()) + { + const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); + snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); + ss << buf; + } + + const std::string str = ss.str(); + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); + } } // @@ -234,6 +257,7 @@ struct llama_server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; + LOG_VERBOSE("new task id", {{"new_id", task.id}}); } queue_tasks.push_back(std::move(task)); condition_tasks.notify_one(); @@ -249,7 +273,9 @@ struct llama_server_queue { // Get the next id for creating anew task int get_new_id() { std::unique_lock lock(mutex_tasks); - return id++; + int new_id = id++; + LOG_VERBOSE("new task id", {{"new_id", new_id}}); + return new_id; } // Register function to process a new task @@ -290,8 +316,7 @@ struct llama_server_queue { void start_loop() { running = true; while (true) { - // new task arrived - LOG_VERBOSE("have new task", {}); + LOG_VERBOSE("new task may arrive", {}); { while (true) { @@ -303,7 +328,7 @@ struct llama_server_queue { task_server task = queue_tasks.front(); queue_tasks.erase(queue_tasks.begin()); lock.unlock(); - LOG_VERBOSE("callback_new_task", {}); + LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); callback_new_task(task); } LOG_VERBOSE("callback_all_task_finished", {}); @@ -384,11 +409,13 @@ struct llama_server_response { std::condition_variable condition_results; void add_waiting_task_id(int task_id) { + LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } void remove_waiting_task_id(int task_id) { + LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); } @@ -401,7 +428,6 @@ struct llama_server_response { condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - LOG_VERBOSE("condition_results unblock", {}); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -426,20 +452,20 @@ struct llama_server_response { // Send a new result to a waiting task_id void send(task_result result) { std::unique_lock lock(mutex_results); - LOG_VERBOSE("send new result", {}); + LOG_VERBOSE("send new result", {{"task_id", result.id}}); for (auto& task_id : waiting_task_ids) { // LOG_TEE("waiting task id %i \n", task_id); // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result if (result.multitask_id == task_id) { - LOG_VERBOSE("callback_update_multitask", {}); + LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}}); callback_update_multitask(task_id, result.id, result); continue; } if (result.id == task_id) { - LOG_VERBOSE("queue_results.push_back", {}); + LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}}); queue_results.push_back(result); condition_results.notify_all(); return; From 7d548a1827f6fc6aece6db74c9d112da42c40d68 Mon Sep 17 00:00:00 2001 From: Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Date: Sun, 25 Feb 2024 10:57:34 -0500 Subject: [PATCH 44/51] readme : add Msty to UI list (#5618) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3bc512af0602b..d61f9171b1b62 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Msty](https://msty.app) (proprietary) --- From f1a98c52546d009f742bdec2154c2a314ea950a6 Mon Sep 17 00:00:00 2001 From: kwin1412 <42286931+kwin1412@users.noreply.github.com> Date: Mon, 26 Feb 2024 00:46:49 +0800 Subject: [PATCH 45/51] make : fix nvcc version is empty (#5713) fix nvcc version is empty --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f03faf6eda0fb..068f6ed028460 100644 --- a/Makefile +++ b/Makefile @@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef LLAMA_CUBLAS $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) -CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])') +CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH From abbabc5e51d0d4656b438aec10b7fae9479ef37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sun, 25 Feb 2024 19:43:00 +0100 Subject: [PATCH 46/51] ggml-quants : provide ggml_vqtbl1q_u8 for 64bit compatibility (#5711) * [ggml-quants] Provide ggml_vqtbl1q_u8 for 64bit compatibility vqtbl1q_u8 is not part of arm v7 neon library * [android-example] Remove abi filter after arm v7a fix * [github-workflows] Do not skip Android armeabi-v7a build --- .github/workflows/build.yml | 3 +- examples/llama.android/app/build.gradle.kts | 8 ++--- ggml-quants.c | 33 ++++++++++++++++++--- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 03d76d45560cf..66ad85938ca16 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -669,8 +669,7 @@ jobs: run: | cd examples/llama.android - # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820). - ./gradlew build --no-daemon -Pskip-armeabi-v7a + ./gradlew build --no-daemon # freeBSD-latest: # runs-on: macos-12 diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index aadbe22c91835..d42140efe8168 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -21,12 +21,8 @@ android { useSupportLibrary = true } ndk { - // Workaround for https://github.com/llvm/llvm-project/issues/65820 - // affecting armeabi-v7a. Skip armeabi-v7a when invoked with - // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a). - if (project.hasProperty("skip-armeabi-v7a")) { - abiFilters += listOf("arm64-v8a", "x86_64", "x86") - } + // Add NDK properties if wanted, e.g. + // abiFilters += listOf("arm64-v8a") } externalNativeBuild { cmake { diff --git a/ggml-quants.c b/ggml-quants.c index 5c5f2ce1b9b87..3d94d166d1b6d 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { return res; } +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + #else #define ggml_int16x8x2_t int16x8x2_t @@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { #define ggml_vld1q_s8_x2 vld1q_s8_x2 #define ggml_vld1q_s8_x4 vld1q_s8_x4 #define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 #endif @@ -9488,8 +9513,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v qs += 16; vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); - vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[1] = vceqq_u8(vs.val[1], mask2); @@ -9497,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); - vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[1] = vceqq_u8(vs.val[1], mask2); From f7625019c51ca437a5840576d92362cfa710e4a2 Mon Sep 17 00:00:00 2001 From: compilade <113953597+compilade@users.noreply.github.com> Date: Sun, 25 Feb 2024 13:43:50 -0500 Subject: [PATCH 47/51] server : fix crash when system prompt is bigger than batch size (#5714) The system prompt is now decoded in batches. * server : fix off-by-one n_past when start of prompt matches whole cache The tokens right after the matching part would otherwise skip a pos value. --- examples/server/server.cpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d970202d2b5d3..c1eb61678c38a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -902,10 +902,24 @@ struct llama_server_context llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } - if (llama_decode(ctx, batch) != 0) + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) { - LOG_TEE("%s: llama_decode() failed\n", __func__); - return; + const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + if (llama_decode(ctx, batch_view) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } } // assign the system KV cache to all parallel sequences @@ -1785,6 +1799,14 @@ struct llama_server_context } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + + // the last token of the cache is not in the KV cache until the next call to llama_decode + // (it was sampled, pushed into the "cache_tokens", but not yet put in the context) + if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) + { + slot.n_past -= 1; + } + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; if (slot.ga_n != 1) From bf08e00643fd529f748f0a858fd79f3061e3fa18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 22:12:24 +0200 Subject: [PATCH 48/51] llama : refactor k-shift implementation + KV defragmentation (#5691) * llama : refactor k-shift implementation ggml-ci * llama : rename llama_kv_cache_seq_shift to llama_kv_cache_seq_add * llama : cont k-shift refactoring + normalize type names ggml-ci * minor : fix MPI builds * llama : reuse n_rot from the build context ggml-ci * llama : revert enum name changes from this PR ggml-ci * llama : update llama_rope_type * llama : add comment about rope values * llama : fix build * passkey : apply kv cache updates explicitly ggml-ci * llama : change name to llama_kv_cache_update() * llama : add llama_kv_cache_seq_pos_max() * passkey : fix llama_kv_cache_seq_pos_max() usage * llama : some llama_kv_cell simplifications * llama : add llama_kv_cache_compress (EXPERIMENTAL) * llama : add alternative KV cache merging (EXPERIMENTAL) * llama : add llama_kv_cache_defrag * llama : comments * llama : remove llama_kv_cache_compress will add in a separate PR ggml-ci * llama : defragment via non-overlapping moves * llama : ggml_graph based defrag implementation ggml-ci * llama : switch the loop order in build_defrag * llama : add comments --- examples/infill/infill.cpp | 4 +- examples/main/main.cpp | 10 +- examples/passkey/passkey.cpp | 25 +- examples/server/server.cpp | 8 +- llama.cpp | 869 ++++++++++++++++++++++++----------- llama.h | 34 +- 6 files changed, 646 insertions(+), 304 deletions(-) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 92c67b7cff5c8..d4b8729dd0283 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -447,8 +447,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7555dffe441f0..34e84d0d42f87 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -548,8 +548,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -576,9 +576,9 @@ int main(int argc, char ** argv) { LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e12a1cdf19a79..47de67a93047f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -126,7 +126,7 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token @@ -146,10 +146,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_update (ctx); - n_past -= bd; + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } llama_batch_clear(batch); @@ -179,10 +180,12 @@ int main(int argc, char ** argv) { LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); + llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; llama_batch_clear(batch); @@ -208,10 +211,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); + llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c1eb61678c38a..8aadc95a9728f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1636,8 +1636,8 @@ struct llama_server_context {"n_system_tokens", system_tokens.size()}, {"n_cache_tokens", slot.cache_tokens.size()} }); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1941,9 +1941,9 @@ struct llama_server_context LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; diff --git a/llama.cpp b/llama.cpp index acd9be08a6e5e..3424b1999ebdd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1550,8 +1550,9 @@ static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; struct llama_hparams { - bool vocab_only; - bool rope_finetuned; + bool vocab_only; + bool rope_finetuned; + uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -1580,7 +1581,8 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -1707,11 +1709,20 @@ struct llama_kv_cell { bool has_seq_id(const llama_seq_id & id) const { return seq_id.find(id) != seq_id.end(); } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const llama_kv_cell & other) const { + return seq_id == other.seq_id; + } }; // ring-buffer of cached KV data struct llama_kv_cache { bool has_shift = false; + bool do_defrag = false; // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_internal also uses it, so it @@ -1723,6 +1734,9 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + std::vector cells; std::vector k_l; // per layer @@ -1958,8 +1972,8 @@ struct llama_context { static bool llama_kv_cache_init( struct llama_kv_cache & cache, const llama_model & model, - ggml_type ktype, - ggml_type vtype, + ggml_type type_k, + ggml_type type_v, uint32_t n_ctx, bool offload) { const struct llama_hparams & hparams = model.hparams; @@ -1974,6 +1988,9 @@ static bool llama_kv_cache_init( cache.size = n_ctx; cache.used = 0; + cache.type_k = type_k; + cache.type_v = type_v; + cache.cells.clear(); cache.cells.resize(n_ctx); @@ -2014,8 +2031,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2099,7 +2116,7 @@ static bool llama_kv_cache_find_slot( // find how many cells are currently in use static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { for (uint32_t i = cache.size - 1; i > 0; --i) { - if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) { return i + 1; } } @@ -2135,7 +2152,7 @@ static void llama_kv_cache_seq_rm( } else { continue; } - if (cache.cells[i].seq_id.empty()) { + if (cache.cells[i].is_empty()) { // keep count of the number of used cells if (cache.cells[i].pos >= 0) cache.used--; @@ -2186,7 +2203,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } -static void llama_kv_cache_seq_shift( +static void llama_kv_cache_seq_add( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -2204,10 +2221,14 @@ static void llama_kv_cache_seq_shift( cache.cells[i].delta += delta; if (cache.cells[i].pos < 0) { - if (!cache.cells[i].seq_id.empty()) cache.used--; + if (!cache.cells[i].is_empty()) { + cache.used--; + } cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; + if (new_head == cache.size) { + new_head = i; + } } } } @@ -2239,6 +2260,22 @@ static void llama_kv_cache_seq_div( } } +static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id)) { + result = std::max(result, cache.cells[i].pos); + } + } + + return result; +} + +static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { + cache.do_defrag = true; +} + // // model loading and saving // @@ -2310,7 +2347,7 @@ namespace GGUFMeta { } }; - struct ArrayInfo{ + struct ArrayInfo { const gguf_type gt; const size_t length; const void * data; @@ -2329,7 +2366,7 @@ namespace GGUFMeta { }; template - class GKV: public GKV_Base { + class GKV : public GKV_Base { GKV() = delete; public: @@ -2352,39 +2389,39 @@ namespace GGUFMeta { return "unknown"; } - static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { - if (!override) { return false; } - if (override->tag == expected_type) { + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { + if (!ovrd) { return false; } + if (ovrd->tag == expected_type) { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", - __func__, override_type_to_str(override->tag), override->key); - switch (override->tag) { + __func__, override_type_to_str(ovrd->tag), ovrd->key); + switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_TYPE_BOOL: { - LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_TYPE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); } break; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", override->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... throw std::runtime_error( format("Unsupported attempt to override %s type for metadata key %s\n", - override_type_to_str(override->tag), override->key)); + override_type_to_str(ovrd->tag), ovrd->key)); } return true; } LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", - __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); return false; } template static typename std::enable_if::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, override)) { - target = override->bool_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { + target = ovrd->bool_value; return true; } return false; @@ -2392,9 +2429,9 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, override)) { - target = override->int_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { + target = ovrd->int_value; return true; } return false; @@ -2402,9 +2439,9 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, override)) { - target = override->float_value; + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { + target = ovrd->float_value; return true; } return false; @@ -2412,17 +2449,17 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { + try_override(T & target, const struct llama_model_kv_override * ovrd) { (void)target; - (void)override; - if (!override) { return false; } + (void)ovrd; + if (!ovrd) { return false; } // Currently, we should never end up here so it would be a bug if we do. throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - override ? override->key : "NULL")); + ovrd ? ovrd->key : "NULL")); } - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { - if (try_override(target, override)) { + static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + if (try_override(target, ovrd)) { return true; } if (k < 0) { return false; } @@ -2430,12 +2467,12 @@ namespace GGUFMeta { return true; } - static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, gguf_find_key(ctx, key), target, override); + static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, gguf_find_key(ctx, key), target, ovrd); } - static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, key.c_str(), target, override); + static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, key.c_str(), target, ovrd); } }; } @@ -2846,6 +2883,15 @@ struct llama_model_loader { } }; +template<> +bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { + uint32_t tmp; + const bool found = get_key(kid, tmp, required); + result = (enum llama_pooling_type) tmp; + return found; +} + + // // load LLaMA models // @@ -2926,16 +2972,16 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } + static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } - static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { @@ -3112,10 +3158,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); switch (hparams.n_layer) { case 3: @@ -3133,10 +3179,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_NOMIC_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); if (hparams.n_layer == 12 && hparams.n_embd == 768) { model.type = e_model::MODEL_137M; @@ -3275,6 +3321,8 @@ static void llm_load_hparams( if (hparams.f_max_alibi_bias > 0.0f) { hparams.need_kq_pos = true; } + + hparams.rope_type = llama_rope_type(&model); } // TODO: This should probably be in llama.h @@ -3577,6 +3625,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); + LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); + LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -4598,12 +4648,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function; -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, -}; - enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU, @@ -4649,55 +4693,6 @@ static struct ggml_tensor * llm_build_inp_embd( return inpL; } -// Persimmon: n_rot = n_embd_head_k/2 -// Other: n_rot = n_embd_head_k -static void llm_build_k_shift( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * K_shift, - llm_rope_type type, - int64_t n_ctx, - float freq_base, - float freq_scale, - const llm_build_cb & cb) { - const int64_t n_layer = hparams.n_layer; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int32_t n_rot = hparams.n_rot; - const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; - const float ext_factor = cparams.yarn_ext_factor; - const float attn_factor = cparams.yarn_attn_factor; - const float beta_fast = cparams.yarn_beta_fast; - const float beta_slow = cparams.yarn_beta_slow; - - int rope_type = 0; - - switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - 0), - K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} - static void llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, @@ -5001,6 +4996,7 @@ struct llm_build_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; @@ -5025,8 +5021,8 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const bool do_rope_shift; - const uint32_t pooling_type; + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; const llm_build_cb & cb; @@ -5048,6 +5044,7 @@ struct llm_build_context { kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), + n_rot (hparams.n_rot), n_ctx (cparams.n_ctx), n_head (hparams.n_head), n_head_kv (hparams.n_head_kv), @@ -5069,8 +5066,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_TYPE_NONE), + pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5093,6 +5090,74 @@ struct llm_build_context { } } + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0), + lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); + } + + return gf; + } + + struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int i = 0; i < n_kv; ++i) { + const int id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + int nm = 1; + + while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { + nm++; + } + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5114,11 +5179,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5154,14 +5214,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5302,11 +5362,6 @@ struct llm_build_context { struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); cb(KQ_pos, "KQ_pos", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5330,12 +5385,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -5420,11 +5475,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5463,13 +5513,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5639,10 +5689,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; @@ -5700,7 +5746,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -5708,7 +5754,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -5717,29 +5763,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * hparams.n_rot + ggml_element_size(tmpq) * n_rot ); cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * hparams.n_rot + ggml_element_size(tmpk) * n_rot ); cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5991,14 +6037,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6287,11 +6333,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6328,14 +6369,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6410,11 +6451,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6444,13 +6480,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6524,11 +6560,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6564,14 +6595,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6645,11 +6676,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { attn_norm_output = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -6687,7 +6713,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -6698,7 +6724,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6767,11 +6793,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -6795,14 +6816,14 @@ struct llm_build_context { cb(Vcur, "Vcur", il); Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6972,11 +6993,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -7002,14 +7018,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7080,11 +7096,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7120,14 +7131,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7199,11 +7210,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7239,14 +7245,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7331,11 +7337,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7371,14 +7372,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7467,11 +7468,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -7494,7 +7490,7 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -7503,7 +7499,7 @@ struct llm_build_context { Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -7556,6 +7552,40 @@ struct llm_build_context { } }; +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_defrag(ids); + + llm.free(); + + return result; +} + +static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_k_shift(); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -7675,6 +7705,20 @@ static struct ggml_cgraph * llama_build_graph( return result; } +static void llama_set_k_shift(llama_context & lctx) { + const auto & cparams = lctx.cparams; + + const int64_t n_ctx = cparams.n_ctx; + + assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } +} + static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // // set input data @@ -7742,18 +7786,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (kv_self.has_shift) { - const int64_t n_ctx = cparams.n_ctx; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; @@ -7798,6 +7830,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } +static void llama_graph_compute( + llama_context & lctx, + ggml_cgraph * gf, + int n_threads) { +#ifdef GGML_USE_MPI + const int64_t n_layer = lctx.model.hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); + } +#endif + + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + } + + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); + +#ifdef GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); +#endif +} + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -7893,14 +7953,17 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + llama_kv_cache_update(&lctx); + ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + if (strcmp(res->name, "result_output") == 0) { // the embeddings could be the second to last tensor, or the third to last tensor if (strcmp(embeddings->name, "result_norm") != 0) { @@ -7927,40 +7990,12 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } -#ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend_metal)) { - ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); - } -#endif - - if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); - } - llama_set_inputs(lctx, batch); - ggml_backend_sched_graph_compute(lctx.sched, gf); - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - -#ifdef GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif + llama_graph_compute(lctx, gf, n_threads); // update the kv ring buffer { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; - } - } - kv_self.head += n_tokens; // Ensure kv cache head points to a valid index. @@ -8056,6 +8091,221 @@ static int llama_decode_internal( return 0; } +// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache +static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { + auto & kv_self = lctx.kv_self; + + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + // determine the size of the hole + uint32_t nh = 1; + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + // starting from the end, find nh non-empty cells + uint32_t nf = 0; + uint32_t is = n_kv - 1; + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + // go back and move the nf cells to the hole + for (uint32_t i1 = is; i1 < n_kv; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + n_moves++; + nf++; + } + + LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, n_kv, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + kv_self.head = n_used; + kv_self.used = n_used; + + // zero the rest of the cells + for (uint32_t i = n_used; i < n_kv; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + // ggml_graph defrag + + ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); +#endif + + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); +} + +static void llama_kv_cache_update_internal(struct llama_context & lctx) { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); + + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = lctx.kv_self; + + kv_self.has_shift = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (lctx.kv_self.do_defrag) { + llama_kv_cache_defrag_internal(lctx); + + lctx.kv_self.do_defrag = false; + } +} + // // tokenizer // @@ -11671,8 +11921,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, - cparams.n_ctx, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -11820,6 +12069,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +enum llama_rope_type llama_rope_type(const struct llama_model * model) { + switch (model->arch) { + // these models do not use RoPE + case LLM_ARCH_GPT2: + case LLM_ARCH_GPTJ: + case LLM_ARCH_GPTNEOX: + case LLM_ARCH_MPT: + case LLM_ARCH_REFACT: + case LLM_ARCH_BLOOM: + return LLAMA_ROPE_TYPE_NONE; + + // use what we call a normal RoPE, operating on pairs of consecutive head values + case LLM_ARCH_LLAMA: + case LLM_ARCH_BAICHUAN: + case LLM_ARCH_STARCODER: + case LLM_ARCH_PLAMO: + case LLM_ARCH_CODESHELL: + case LLM_ARCH_ORION: + case LLM_ARCH_INTERNLM2: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GEMMA: + return LLAMA_ROPE_TYPE_NORM; + + // the pairs of head values are offset by n_rot/2 + case LLM_ARCH_FALCON: + case LLM_ARCH_PERSIMMON: + case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_STABLELM: + case LLM_ARCH_QWEN: + case LLM_ARCH_QWEN2: + case LLM_ARCH_PHI2: + return LLAMA_ROPE_TYPE_NEOX; + + // all model arches should be listed explicitly here + case LLM_ARCH_UNKNOWN: + GGML_ASSERT(false && "unknown architecture"); + break; + } + + return LLAMA_ROPE_TYPE_NONE; +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -12062,12 +12354,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } -void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { +void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } - llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); + llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); } void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { @@ -12078,6 +12370,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); +} + +void llama_kv_cache_defrag(struct llama_context * ctx) { + llama_kv_cache_defrag(ctx->kv_self); +} + +void llama_kv_cache_update(struct llama_context * ctx) { + llama_kv_cache_update_internal(*ctx); +} + + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. @@ -12204,10 +12509,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const auto n_layer = hparams.n_layer; - const auto n_embd_k_gqa = hparams.n_embd_k_gqa(); - const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); - const auto n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; @@ -12222,14 +12527,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); @@ -12316,10 +12623,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const int n_layer = hparams.n_layer; - const int n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; size_t kv_buf_size; uint32_t kv_head; @@ -12335,13 +12642,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { GGML_ASSERT(kv_self.total_size() == kv_buf_size); for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; diff --git a/llama.h b/llama.h index 947284ea2f535..ff131996d9a38 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,15 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + // note: these values should be synchronized with ggml_rope + // TODO: maybe move this enum to ggml.h (ggml_rope_type) + enum llama_rope_type { + LLAMA_ROPE_TYPE_NONE = -1, + LLAMA_ROPE_TYPE_NORM = 0, + LLAMA_ROPE_TYPE_NEOX = 2, + LLAMA_ROPE_TYPE_GLM = 4, + }; + enum llama_token_type { LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, @@ -360,6 +369,7 @@ extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @@ -514,10 +524,12 @@ extern "C" { llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_shift( + LLAMA_API void llama_kv_cache_seq_add( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, @@ -525,7 +537,9 @@ extern "C" { llama_pos delta); // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -535,6 +549,20 @@ extern "C" { llama_pos p1, int d); + // Returns the largest position present in the KV cache for the specified sequence + LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id); + + // Defragment the KV cache + // This will be applied: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() + LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); + + // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) + LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); + // // State / sessions // From 8b350356b28f782deab63d8b0e9ae103ceb25fcd Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 25 Feb 2024 21:46:29 +0100 Subject: [PATCH 49/51] server: docs - refresh and tease a little bit more the http server (#5718) * server: docs - refresh and tease a little bit more the http server * Rephrase README.md server doc Co-authored-by: Georgi Gerganov * Update examples/server/README.md Co-authored-by: Georgi Gerganov * Update examples/server/README.md Co-authored-by: Georgi Gerganov * Update README.md --------- Co-authored-by: Georgi Gerganov --- README.md | 3 +++ examples/server/README.md | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d61f9171b1b62..d0af5d0b9b077 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,9 @@ Typically finetunes of the base models below are supported as well. - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) +**HTTP server** + +[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. **Bindings:** diff --git a/examples/server/README.md b/examples/server/README.md index cb3fd6054095b..0e9bd7fd404ba 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,8 +1,20 @@ -# llama.cpp/example/server +# LLaMA.cpp HTTP Server -This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp. +Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**. -Command line options: +Set of LLM REST APIs and a simple web front end to interact with llama.cpp. + +**Features:** + * LLM inference of F16 and quantum models on GPU and CPU + * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes + * Parallel decoding with multi-user support + * Continuous batching + * Multimodal (wip) + * Monitoring endpoints + +The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). + +**Command line options:** - `--threads N`, `-t N`: Set the number of threads to use during generation. - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. From e3965cf35aac00d4e24998c8a3d0093ae1d98bd3 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 25 Feb 2024 22:48:33 +0100 Subject: [PATCH 50/51] server: tests - slow inference causes timeout on the CI (#5715) * server: tests - longer inference timeout for CI --- common/sampling.cpp | 2 +- examples/server/tests/features/steps/steps.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index de4331a1182d6..e67096bea6932 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl( // } //} - LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); + //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); } } diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 8e4babf204f8a..ad87fcb820aa8 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -699,6 +699,8 @@ async def wait_for_health_status(context, if context.debug: print(f"Starting checking for health for expected_health_status={expected_health_status}") timeout = 3 # seconds + if expected_health_status == 'ok': + timeout = 10 # CI slow inference interval = 0.5 counter = 0 async with aiohttp.ClientSession() as session: @@ -736,7 +738,7 @@ async def wait_for_health_status(context, if n_completions > 0: return - assert False, 'timeout exceeded' + assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}' def assert_embeddings(embeddings): From c39373398803c669056304090050fe3f44b41bf9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 25 Feb 2024 00:17:11 +0000 Subject: [PATCH 51/51] flake.lock: Update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/5863c27340ba4de8f83e7e3c023b9599c3cb3c80' (2024-02-16) → 'github:NixOS/nixpkgs/cbc4211f0afffe6dfd2478a62615dd5175a13f9a' (2024-02-23) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 47d6448b5ceb9..9f659ba8f4cef 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1708118438, - "narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=", + "lastModified": 1708655239, + "narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80", + "rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a", "type": "github" }, "original": {