From 53e0215053bf6557edfe86630a3bca4035e69543 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 20 Nov 2024 14:42:03 +0500 Subject: [PATCH 01/10] Initial "prefix+suffix" chat template * llama, common, server * main handles prefix and suffix separately, only adjust for example display --- common/arg.cpp | 8 ++++---- common/common.cpp | 22 +++++++++++++++------- common/common.h | 6 +++++- examples/main/main.cpp | 2 +- examples/server/server.cpp | 13 +++++++++---- examples/server/utils.hpp | 10 ++++++---- include/llama.h | 2 ++ src/llama.cpp | 18 +++++++++++++++++- 8 files changed, 59 insertions(+), 22 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 4115b2f7511d3..97fedfcc36fb4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -850,17 +850,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "string to prefix user inputs with (default: empty)", [](common_params & params, const std::string & value) { params.input_prefix = value; - params.enable_chat_template = false; + // params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", [](common_params & params, const std::string & value) { params.input_suffix = value; - params.enable_chat_template = false; + // params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", diff --git a/common/common.cpp b/common/common.cpp index d314523db4c62..d7d25470b810a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1559,12 +1559,14 @@ std::string common_detokenize(llama_context * ctx, const std::vector= 0; } std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, + const std::string & prefix, + const std::string & suffix, const std::vector & msgs, bool add_ass) { int alloc_size = 0; @@ -1576,10 +1578,12 @@ std::string common_chat_apply_template(const struct llama_model * model, } const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + const char * ptr_prefix = prefix.empty() ? nullptr : prefix.c_str(); + const char * ptr_suffix = suffix.empty() ? nullptr : suffix.c_str(); std::vector buf(alloc_size); // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + int32_t res = llama_chat_apply_template(model, ptr_tmpl, ptr_prefix, ptr_suffix, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { @@ -1589,7 +1593,7 @@ std::string common_chat_apply_template(const struct llama_model * model, throw std::runtime_error("this custom template is not supported"); } else { // If the built-in template is not supported, we default to chatml - res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = llama_chat_apply_template(nullptr, "chatml", nullptr, nullptr, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); fallback = true; } } @@ -1600,6 +1604,8 @@ std::string common_chat_apply_template(const struct llama_model * model, res = llama_chat_apply_template( fallback ? 
nullptr : model, fallback ? "chatml" : ptr_tmpl, + fallback ? nullptr : ptr_prefix, + fallback ? nullptr : ptr_suffix, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } @@ -1613,7 +1619,9 @@ std::string common_chat_format_single(const struct llama_model * model, const common_chat_msg & new_msg, bool add_ass) { std::ostringstream ss; - auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false); + const std::string prefix; + const std::string suffix; + auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, prefix, suffix, past_msg, false); std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { @@ -1621,21 +1629,21 @@ std::string common_chat_format_single(const struct llama_model * model, }; // format chat with new_msg chat_new.push_back(new_msg); - auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass); + auto fmt_new_msg = common_chat_apply_template(model, tmpl, prefix, suffix, chat_new, add_ass); // get the diff part ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); return ss.str(); } std::string common_chat_format_example(const struct llama_model * model, - const std::string & tmpl) { + const std::string & tmpl, const std::string & prefix, const std::string & suffix) { std::vector msgs = { {"system", "You are a helpful assistant"}, {"user", "Hello"}, {"assistant", "Hi there"}, {"user", "How are you?"}, }; - return common_chat_apply_template(model, tmpl, msgs, true); + return common_chat_apply_template(model, tmpl, prefix, suffix, msgs, true); } // diff --git a/common/common.h b/common/common.h index 7977cc7a99a78..ad6af3613e34a 100644 --- a/common/common.h +++ b/common/common.h @@ -523,6 +523,8 @@ bool common_chat_verify_template(const std::string & tmpl); // If the custom "tmpl" is not supported, we throw an error std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, + const std::string & prefix, + const std::string & suffix, const std::vector & chat, bool add_ass); @@ -535,7 +537,9 @@ std::string common_chat_format_single(const struct llama_model * model, // Returns an example of formatted chat std::string common_chat_format_example(const struct llama_model * model, - const std::string & tmpl); + const std::string & tmpl, + const std::string & prefix, + const std::string & suffix); // // KV cache utils diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7c4ce4be2abae..fe5dc6515b85b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -202,7 +202,7 @@ int main(int argc, char ** argv) { // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template, params.input_suffix, params.input_suffix).c_str()); } else { LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b8e003be9730e..8777882dbb395 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -667,7 +667,7 @@ struct server_context { if (res >= 0) { llama_chat_message chat[] = 
{{"user", "test"}}; std::string tmpl = std::string(model_template.data(), model_template.size()); - int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0); + int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), nullptr, nullptr, chat, 1, true, nullptr, 0); return chat_res > 0; } return false; @@ -2829,7 +2829,7 @@ int main(int argc, char ** argv) { return; } - json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); + json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template, params.input_prefix, params.input_suffix); std::vector tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); ctx_server.queue_results.add_waiting_tasks(tasks); @@ -3220,14 +3220,19 @@ int main(int argc, char ** argv) { // if a custom chat template is not supplied, we will use the one that comes with the model (if any) if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { + if (!params.input_prefix.empty() || !params.input_suffix.empty()) { + LOG_WRN("%s: Prefix and suffix are used instead of a chat template. This may cause the model to output suboptimal responses\n", __func__); + params.chat_template = "custom"; + } else if (!ctx_server.validate_model_chat_template()) { LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); params.chat_template = "chatml"; } + } else if (!params.input_prefix.empty() || !params.input_suffix.empty()) { + LOG_WRN("%s: Prefix and suffix are not used because a chat template is defined.\n", __func__); } // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str()); + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template, params.input_prefix, params.input_suffix).c_str()); ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c47ed3e47a76d..09f3a0c852513 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -300,7 +300,7 @@ static llama_tokens format_infill( } // Format given chat. 
If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::string & prefix, const std::string & suffix, const std::vector & messages) { std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { @@ -328,7 +328,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true); + const auto formatted_chat = common_chat_apply_template(model, tmpl, prefix, suffix, chat, true); LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); return formatted_chat; @@ -597,13 +597,15 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons static json oaicompat_completion_params_parse( const struct llama_model * model, const json & body, /* openai api json semantics */ - const std::string & chat_template) { + const std::string & chat_template, + const std::string & input_prefix, + const std::string & input_suffix) { json llama_params; llama_params["__oaicompat"] = true; // Apply chat template to the list of messages - llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); + llama_params["prompt"] = format_chat(model, chat_template, input_prefix, input_suffix, body.at("messages")); // Handle "stop" field if (body.contains("stop") && body.at("stop").is_string()) { diff --git a/include/llama.h b/include/llama.h index 90791d5f5ea12..0483b527e0139 100644 --- a/include/llama.h +++ b/include/llama.h @@ -981,6 +981,8 @@ extern "C" { LLAMA_API int32_t llama_chat_apply_template( const struct llama_model * model, const char * tmpl, + const char * prefix, + const char * suffix, const struct llama_chat_message * chat, size_t n_msg, bool add_ass, diff --git a/src/llama.cpp b/src/llama.cpp index c51b36e66042e..789c7578da169 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21820,6 +21820,8 @@ int32_t llama_detokenize( // This function uses heuristic checks to determine commonly used template. It is not a jinja parser. static int32_t llama_chat_apply_template_internal( const std::string & tmpl, + const std::string & prefix, + const std::string & suffix, const std::vector & chat, std::string & dest, bool add_ass) { // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 @@ -22096,6 +22098,16 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_of_role|>assistant<|end_of_role|>\n"; } + } else if (tmpl == "custom") { + // a custom template using only prefix and suffix + for (auto message : chat) { + std::string role(message->role); + if (role == "user") { + ss << prefix << message->content << suffix; + } else { + ss << message->content; + } + } } else { // template not supported return -1; @@ -22107,6 +22119,8 @@ static int32_t llama_chat_apply_template_internal( int32_t llama_chat_apply_template( const struct llama_model * model, const char * tmpl, + const char * prefix, + const char * suffix, const struct llama_chat_message * chat, size_t n_msg, bool add_ass, @@ -22135,7 +22149,9 @@ int32_t llama_chat_apply_template( } std::string formatted_chat; - int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); + std::string prefix_chat = (prefix == nullptr ? 
"" : prefix); + std::string suffix_chat = (suffix == nullptr ? "" : suffix); + int32_t res = llama_chat_apply_template_internal(curr_tmpl, prefix_chat, suffix_chat, chat_vec, formatted_chat, add_ass); if (res < 0) { return res; } From 9b58edf2d8af6c50e7a0c23034c29cdcc6422ced Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 20 Nov 2024 15:40:59 +0500 Subject: [PATCH 02/10] Fix trailing whitespace, reverted `enable_chat_template` in arg * `enable_chat_template` is no used by `server` --- common/arg.cpp | 4 ++-- examples/server/server.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 97fedfcc36fb4..bb005689c1095 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -850,7 +850,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "string to prefix user inputs with (default: empty)", [](common_params & params, const std::string & value) { params.input_prefix = value; - // params.enable_chat_template = false; + params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( @@ -858,7 +858,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "string to suffix after user inputs with (default: empty)", [](common_params & params, const std::string & value) { params.input_suffix = value; - // params.enable_chat_template = false; + params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8777882dbb395..1d68cd6440b60 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3228,7 +3228,7 @@ int main(int argc, char ** argv) { params.chat_template = "chatml"; } } else if (!params.input_prefix.empty() || !params.input_suffix.empty()) { - LOG_WRN("%s: Prefix and suffix are not used because a chat template is defined.\n", __func__); + LOG_WRN("%s: Prefix and suffix are not used because a chat template is defined.\n", __func__); } // print sample chat example to make it clear which template is used From b3e343eae707fe9486916fb3d619d49cd6a9d12b Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 20 Nov 2024 16:04:55 +0500 Subject: [PATCH 03/10] Fix for `simple-chat` --- examples/simple-chat/simple-chat.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 5f9973163732d..d85a385ce28ce 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -158,10 +158,10 @@ int main(int argc, char ** argv) { // add the user input to the message list and format it messages.push_back({"user", strdup(user.c_str())}); - int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); + int new_len = llama_chat_apply_template(model, nullptr, nullptr, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); if (new_len > (int)formatted.size()) { formatted.resize(new_len); - new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); + new_len = llama_chat_apply_template(model, nullptr, nullptr, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); } if (new_len < 0) { fprintf(stderr, "failed to apply the chat 
template\n"); @@ -178,7 +178,7 @@ int main(int argc, char ** argv) { // add the response to the messages messages.push_back({"assistant", strdup(response.c_str())}); - prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0); + prev_len = llama_chat_apply_template(model, nullptr, nullptr, nullptr, messages.data(), messages.size(), false, nullptr, 0); if (prev_len < 0) { fprintf(stderr, "failed to apply the chat template\n"); return 1; From fc050381ef62102f277bb20657a53c691281b047 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 20 Nov 2024 16:12:25 +0500 Subject: [PATCH 04/10] Fixed `test-chat-template` --- tests/test-chat-template.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 03e897e66dca4..51121533ca2f2 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -118,7 +118,7 @@ int main(void) { int32_t res; // test invalid chat template - res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); + res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", nullptr, nullptr, conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); assert(res < 0); for (size_t i = 0; i < templates.size(); i++) { @@ -128,6 +128,8 @@ int main(void) { res = llama_chat_apply_template( nullptr, custom_template.c_str(), + nullptr, + nullptr, conversation, message_count, true, From ec6212ee64ff84a59cdb8d88998d32e02ca4818c Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 20 Nov 2024 17:06:56 +0500 Subject: [PATCH 05/10] Reverted to a simple solution withing server only. --- common/common.cpp | 22 +++++++--------------- common/common.h | 6 +----- examples/main/main.cpp | 2 +- examples/server/server.cpp | 8 ++++---- examples/server/utils.hpp | 11 +++++++++-- examples/simple-chat/simple-chat.cpp | 6 +++--- include/llama.h | 2 -- src/llama.cpp | 18 +----------------- tests/test-chat-template.cpp | 4 +--- 9 files changed, 27 insertions(+), 52 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d7d25470b810a..d314523db4c62 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1559,14 +1559,12 @@ std::string common_detokenize(llama_context * ctx, const std::vector= 0; } std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, - const std::string & prefix, - const std::string & suffix, const std::vector & msgs, bool add_ass) { int alloc_size = 0; @@ -1578,12 +1576,10 @@ std::string common_chat_apply_template(const struct llama_model * model, } const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); - const char * ptr_prefix = prefix.empty() ? nullptr : prefix.c_str(); - const char * ptr_suffix = suffix.empty() ? 
nullptr : suffix.c_str(); std::vector buf(alloc_size); // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, ptr_prefix, ptr_suffix, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { @@ -1593,7 +1589,7 @@ std::string common_chat_apply_template(const struct llama_model * model, throw std::runtime_error("this custom template is not supported"); } else { // If the built-in template is not supported, we default to chatml - res = llama_chat_apply_template(nullptr, "chatml", nullptr, nullptr, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); fallback = true; } } @@ -1604,8 +1600,6 @@ std::string common_chat_apply_template(const struct llama_model * model, res = llama_chat_apply_template( fallback ? nullptr : model, fallback ? "chatml" : ptr_tmpl, - fallback ? nullptr : ptr_prefix, - fallback ? nullptr : ptr_suffix, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } @@ -1619,9 +1613,7 @@ std::string common_chat_format_single(const struct llama_model * model, const common_chat_msg & new_msg, bool add_ass) { std::ostringstream ss; - const std::string prefix; - const std::string suffix; - auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, prefix, suffix, past_msg, false); + auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { @@ -1629,21 +1621,21 @@ std::string common_chat_format_single(const struct llama_model * model, }; // format chat with new_msg chat_new.push_back(new_msg); - auto fmt_new_msg = common_chat_apply_template(model, tmpl, prefix, suffix, chat_new, add_ass); + auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass); // get the diff part ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); return ss.str(); } std::string common_chat_format_example(const struct llama_model * model, - const std::string & tmpl, const std::string & prefix, const std::string & suffix) { + const std::string & tmpl) { std::vector msgs = { {"system", "You are a helpful assistant"}, {"user", "Hello"}, {"assistant", "Hi there"}, {"user", "How are you?"}, }; - return common_chat_apply_template(model, tmpl, prefix, suffix, msgs, true); + return common_chat_apply_template(model, tmpl, msgs, true); } // diff --git a/common/common.h b/common/common.h index ad6af3613e34a..7977cc7a99a78 100644 --- a/common/common.h +++ b/common/common.h @@ -523,8 +523,6 @@ bool common_chat_verify_template(const std::string & tmpl); // If the custom "tmpl" is not supported, we throw an error std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, - const std::string & prefix, - const std::string & suffix, const std::vector & chat, bool add_ass); @@ -537,9 +535,7 @@ std::string common_chat_format_single(const struct llama_model * model, // Returns an example of formatted chat std::string common_chat_format_example(const struct llama_model * model, - const std::string & tmpl, - const std::string & prefix, 
- const std::string & suffix); + const std::string & tmpl); // // KV cache utils diff --git a/examples/main/main.cpp b/examples/main/main.cpp index fe5dc6515b85b..7c4ce4be2abae 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -202,7 +202,7 @@ int main(int argc, char ** argv) { // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template, params.input_suffix, params.input_suffix).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str()); } else { LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1d68cd6440b60..c0f8c7e736e65 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -667,7 +667,7 @@ struct server_context { if (res >= 0) { llama_chat_message chat[] = {{"user", "test"}}; std::string tmpl = std::string(model_template.data(), model_template.size()); - int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), nullptr, nullptr, chat, 1, true, nullptr, 0); + int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0); return chat_res > 0; } return false; @@ -3229,11 +3229,11 @@ int main(int argc, char ** argv) { } } else if (!params.input_prefix.empty() || !params.input_suffix.empty()) { LOG_WRN("%s: Prefix and suffix are not used because a chat template is defined.\n", __func__); + } else { + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str()); } - // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template, params.input_prefix, params.input_suffix).c_str()); - ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 09f3a0c852513..d0d9775cfbe87 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -302,6 +302,7 @@ static llama_tokens format_infill( // Format given chat. 
If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::string & prefix, const std::string & suffix, const std::vector & messages) { std::vector chat; + std::string formatted_chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; @@ -325,10 +326,16 @@ inline std::string format_chat(const struct llama_model * model, const std::stri throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); } - chat.push_back({role, content}); + if (tmpl == "custom") { + // simple format using prefix and suffix + if (role == "user") formatted_chat += prefix + content + suffix; + else formatted_chat += content; + } else { + chat.push_back({role, content}); + } } - const auto formatted_chat = common_chat_apply_template(model, tmpl, prefix, suffix, chat, true); + if (tmpl != "custom") formatted_chat = common_chat_apply_template(model, tmpl, chat, true); LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); return formatted_chat; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index d85a385ce28ce..5f9973163732d 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -158,10 +158,10 @@ int main(int argc, char ** argv) { // add the user input to the message list and format it messages.push_back({"user", strdup(user.c_str())}); - int new_len = llama_chat_apply_template(model, nullptr, nullptr, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); + int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); if (new_len > (int)formatted.size()) { formatted.resize(new_len); - new_len = llama_chat_apply_template(model, nullptr, nullptr, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); + new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); } if (new_len < 0) { fprintf(stderr, "failed to apply the chat template\n"); @@ -178,7 +178,7 @@ int main(int argc, char ** argv) { // add the response to the messages messages.push_back({"assistant", strdup(response.c_str())}); - prev_len = llama_chat_apply_template(model, nullptr, nullptr, nullptr, messages.data(), messages.size(), false, nullptr, 0); + prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0); if (prev_len < 0) { fprintf(stderr, "failed to apply the chat template\n"); return 1; diff --git a/include/llama.h b/include/llama.h index 0483b527e0139..90791d5f5ea12 100644 --- a/include/llama.h +++ b/include/llama.h @@ -981,8 +981,6 @@ extern "C" { LLAMA_API int32_t llama_chat_apply_template( const struct llama_model * model, const char * tmpl, - const char * prefix, - const char * suffix, const struct llama_chat_message * chat, size_t n_msg, bool add_ass, diff --git a/src/llama.cpp b/src/llama.cpp index 789c7578da169..c51b36e66042e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21820,8 +21820,6 @@ int32_t llama_detokenize( // This function uses heuristic checks to determine commonly used template. It is not a jinja parser. 
static int32_t llama_chat_apply_template_internal( const std::string & tmpl, - const std::string & prefix, - const std::string & suffix, const std::vector & chat, std::string & dest, bool add_ass) { // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 @@ -22098,16 +22096,6 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_of_role|>assistant<|end_of_role|>\n"; } - } else if (tmpl == "custom") { - // a custom template using only prefix and suffix - for (auto message : chat) { - std::string role(message->role); - if (role == "user") { - ss << prefix << message->content << suffix; - } else { - ss << message->content; - } - } } else { // template not supported return -1; @@ -22119,8 +22107,6 @@ static int32_t llama_chat_apply_template_internal( int32_t llama_chat_apply_template( const struct llama_model * model, const char * tmpl, - const char * prefix, - const char * suffix, const struct llama_chat_message * chat, size_t n_msg, bool add_ass, @@ -22149,9 +22135,7 @@ int32_t llama_chat_apply_template( } std::string formatted_chat; - std::string prefix_chat = (prefix == nullptr ? "" : prefix); - std::string suffix_chat = (suffix == nullptr ? "" : suffix); - int32_t res = llama_chat_apply_template_internal(curr_tmpl, prefix_chat, suffix_chat, chat_vec, formatted_chat, add_ass); + int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); if (res < 0) { return res; } diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 51121533ca2f2..03e897e66dca4 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -118,7 +118,7 @@ int main(void) { int32_t res; // test invalid chat template - res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", nullptr, nullptr, conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); + res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); assert(res < 0); for (size_t i = 0; i < templates.size(); i++) { @@ -128,8 +128,6 @@ int main(void) { res = llama_chat_apply_template( nullptr, custom_template.c_str(), - nullptr, - nullptr, conversation, message_count, true, From a0e27c1cd00908b6f6b6aeb767fbc36edd68fd15 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 20 Nov 2024 22:51:26 +0500 Subject: [PATCH 06/10] Preliminary work for UI and logging --- examples/server/public/index.html | 48 +++++++++++++++++++++++++++++++ examples/server/server.cpp | 18 ++++++++---- examples/server/utils.hpp | 28 ++++++++++++++++-- 3 files changed, 86 insertions(+), 8 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 6216c08410a28..d65c540cfc682 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -119,6 +119,36 @@

[index.html markup stripped in this extract: the hunk at @@ -119,6 +119,36 @@ appears to add 30 lines of new markup around the "Conversations" text, and a second hunk at @@ -214,6 +244,10 @@ adds 4 lines near the "Settings" text; the remainder of PATCH 06 (server.cpp and utils.hpp changes per its diffstat) is truncated here]
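To make the end state of the series concrete: PATCH 01 threads prefix/suffix down into llama_chat_apply_template() and adds a "custom" template id in llama.cpp, while PATCH 05 reverts that public API change and keeps the logic inside the server's format_chat() in examples/server/utils.hpp, where user turns are wrapped in --in-prefix / --in-suffix and other roles are appended verbatim. The following standalone C++ sketch mirrors that final, server-only behaviour; the function name build_custom_prompt, the main() driver, and the Alpaca-style prefix/suffix strings are illustrative only and are not part of the patch (the message list reuses the example conversation from common_chat_format_example).

// Minimal sketch of the tmpl == "custom" branch that PATCH 05 places in
// examples/server/utils.hpp (format_chat). Illustrative names, not patch code.
#include <cstdio>
#include <string>
#include <vector>

struct chat_msg {
    std::string role;
    std::string content;
};

// User turns are wrapped in the --in-prefix / --in-suffix strings;
// system and assistant turns are appended as-is.
static std::string build_custom_prompt(const std::vector<chat_msg> & messages,
                                       const std::string & prefix,
                                       const std::string & suffix) {
    std::string formatted;
    for (const auto & msg : messages) {
        if (msg.role == "user") {
            formatted += prefix + msg.content + suffix;
        } else {
            formatted += msg.content;
        }
    }
    return formatted;
}

int main() {
    // Example values, as if passed on the command line:
    //   --in-prefix "### Instruction:\n" --in-suffix "\n### Response:\n"
    const std::string prefix = "### Instruction:\n";
    const std::string suffix = "\n### Response:\n";

    const std::vector<chat_msg> messages = {
        {"system",    "You are a helpful assistant\n"},
        {"user",      "Hello"},
        {"assistant", "Hi there\n"},
        {"user",      "How are you?"},
    };

    printf("%s", build_custom_prompt(messages, prefix, suffix).c_str());
    return 0;
}

Note that, per the server.cpp change in PATCH 01/05, this path is only taken when no chat template is supplied and a prefix or suffix is set (params.chat_template is then set to "custom"); if a template is defined, the prefix and suffix are ignored with a warning.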