server: (web UI) Add custom chat formatting that uses input_prefix and input_suffix #10425

Closed
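For reference, a minimal standalone sketch of the behavior this PR adds (the helper below is illustrative and not part of the patch; the example prefix/suffix values are arbitrary): when no chat template is chosen and a prefix and/or suffix is set, user messages are wrapped as `prefix + content + suffix`, and every other message is appended verbatim.

```cpp
// Illustrative sketch of the custom formatting path added to format_chat() in
// examples/server/utils.hpp below; not the patch itself.
#include <cstdio>
#include <string>
#include <vector>

struct msg { std::string role, content; };

static std::string format_custom(const std::vector<msg> & messages,
                                 const std::string & prefix,
                                 const std::string & suffix) {
    std::string out;
    for (const auto & m : messages) {
        if (m.role == "user") {
            out += prefix + m.content + suffix;
        } else {
            out += m.content; // system and assistant messages are kept as-is
        }
    }
    return out;
}

int main() {
    const std::vector<msg> chat = {
        {"system",    "You are a helpful assistant"},
        {"user",      "Hello"},
        {"assistant", "Hi there"},
        {"user",      "How are you?"},
    };
    // e.g. server started with: --in-prefix "[INST] " --in-suffix " [/INST]\n"
    printf("%s\n", format_custom(chat, "[INST] ", " [/INST]\n").c_str());
    return 0;
}
```

With those example values the chat above is flattened to `You are a helpful assistant[INST] Hello [/INST]\nHi there[INST] How are you? [/INST]\n`.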
4 changes: 2 additions & 2 deletions common/arg.cpp
@@ -852,15 +852,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix = value;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
add_opt(common_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
[](common_params & params, const std::string & value) {
params.input_suffix = value;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
add_opt(common_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
10 changes: 10 additions & 0 deletions examples/server/public/index.html
@@ -214,6 +214,10 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="collapse-content">
<!-- Samplers queue -->
<settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
<!-- Prefix -->
<settings-modal-short-input label="Prefix" :config-key="'input_prefix'" :config-default="configDefault" :config-info="configInfo" v-model="config.input_prefix"></settings-modal-short-input>
<!-- Suffix -->
<settings-modal-short-input label="Suffix" :config-key="'input_suffix'" :config-default="configDefault" :config-info="configInfo" v-model="config.input_suffix"></settings-modal-short-input>
<!-- Samplers -->
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
@@ -285,6 +289,8 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
input_prefix: '',
input_suffix: '',
// make sure these default values are in sync with `common.h`
samplers: 'dkypmxt',
temperature: 0.8,
@@ -310,6 +316,8 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
const CONFIG_INFO = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
input_prefix: 'Prefix for user messages in custom chat templates.',
input_suffix: 'Suffix for user messages in custom chat templates.',
samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
@@ -559,6 +567,8 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
stream: true,
cache_prompt: true,
samplers: this.config.samplers,
input_prefix: this.config.input_prefix,
input_suffix: this.config.input_suffix,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
15 changes: 10 additions & 5 deletions examples/server/server.cpp
@@ -2829,7 +2829,7 @@ int main(int argc, char ** argv) {
return;
}

json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template, params.input_prefix, params.input_suffix);

std::vector<server_task> tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION);
ctx_server.queue_results.add_waiting_tasks(tasks);
@@ -3218,16 +3218,21 @@ int main(int argc, char ** argv) {

LOG_INF("%s: model loaded\n", __func__);

// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
// if a standard chat template is not chosen, check prefix and suffix to switch to custom formatting
// otherwise use the one that comes with the model (if any)
// if a standard chat template is chosen, warn about prefix and suffix not being used
if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) {
if (!params.input_prefix.empty() || !params.input_suffix.empty()) {
LOG_WRN("%s: Prefix and suffix will be used for a custom chat template. This may cause the model to output suboptimal responses\n", __func__);
} else if (!ctx_server.validate_model_chat_template()) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "chatml";
}
} else if (!params.input_prefix.empty() || !params.input_suffix.empty()) {
LOG_WRN("%s: Prefix and suffix are defined, but will not be used because a chat template '%s' is chosen.\n", __func__, params.chat_template.c_str());
}

// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
LOG_INF("%s: chat template: '%s', built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.c_str(), params.chat_template.empty(), format_chat_example(ctx_server.model, params.chat_template, params.input_prefix, params.input_suffix).c_str());

ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
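A condensed view of the template-selection order implemented in the server.cpp hunk above (the helper below is hypothetical and only restates those branches; `model_template_ok` stands in for `ctx_server.validate_model_chat_template()`):

```cpp
#include <string>

enum class chat_format { explicit_template, custom_prefix_suffix, model_template, chatml_fallback };

// Hypothetical helper mirroring the branch order in main() above.
static chat_format pick_chat_format(const std::string & chat_template,
                                    const std::string & prefix,
                                    const std::string & suffix,
                                    bool model_template_ok) {
    if (!chat_template.empty()) {
        // an explicit template wins; a warning is logged if prefix/suffix are also set
        return chat_format::explicit_template;
    }
    if (!prefix.empty() || !suffix.empty()) {
        // new path added by this PR: simple prefix/suffix formatting
        return chat_format::custom_prefix_suffix;
    }
    return model_template_ok ? chat_format::model_template : chat_format::chatml_fallback;
}
```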
65 changes: 58 additions & 7 deletions examples/server/utils.hpp
@@ -299,9 +299,12 @@ static llama_tokens format_infill(
return embd_inp;
}

// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
// Format given chat. If tmpl is empty, we either use prefix and suffix (if defined), or take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::string & prefix, const std::string & suffix, const std::vector<json> & messages) {
std::vector<common_chat_msg> chat;
std::string formatted_chat;

bool is_custom = tmpl.empty() && (!prefix.empty() || !suffix.empty());

for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
@@ -325,15 +328,49 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}

chat.push_back({role, content});
if (is_custom) {
// simple format using prefix and suffix
if (role == "user") formatted_chat += prefix + content + suffix;
else formatted_chat += content;
} else {
chat.push_back({role, content});
}
}

if (!is_custom) {
LOG_WRN("Using '%s' template, prefix and suffix are ignored.\n", tmpl.c_str());
formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
} else {
LOG_WRN("Used prefix '%s' and suffix '%s'.\n", prefix.c_str(), suffix.c_str());
}

const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
LOG_DBG("formatted_chat using '%s': '%s'\n", tmpl.c_str(), formatted_chat.c_str());

return formatted_chat;
}

inline std::string format_chat_example(const struct llama_model * model, const std::string & tmpl, const std::string & prefix, const std::string & suffix) {
std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};

std::string formatted_example;

if (tmpl.empty() && (!prefix.empty() || !suffix.empty())) {
for (auto message : msgs) {
if (message.role == "user") formatted_example += prefix + message.content + suffix;
else formatted_example += message.content;
}
} else {
formatted_example = common_chat_apply_template(model, tmpl, msgs, true);
}

return formatted_example;
}

static std::string llama_get_chat_template(const struct llama_model * model) {
std::string template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
@@ -597,13 +634,27 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
static json oaicompat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
const std::string & chat_template,
const std::string & input_prefix,
const std::string & input_suffix) {
json llama_params;

llama_params["__oaicompat"] = true;

// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
std::string chat_tmpl = (body.contains("chat_template") ? body.at("chat_template").get<std::string>() : chat_template);
std::string prefix = (body.contains("input_prefix") ? body.at("input_prefix").get<std::string>() : "");
std::string suffix = (body.contains("input_suffix") ? body.at("input_suffix").get<std::string>() : "");

if (prefix.empty()) {
prefix = input_prefix;
}

if (suffix.empty()) {
suffix = input_suffix;
}

llama_params["prompt"] = format_chat(model, chat_tmpl, prefix, suffix, body.at("messages"));

// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
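On the client side, the same fields can be supplied per request. The sketch below is an assumed usage example, not part of the patch: it builds a chat-completion body with nlohmann::json (as the server code does) using the `input_prefix`/`input_suffix` field names read by `oaicompat_completion_params_parse()` above.

```cpp
// Sketch of a request body for the OpenAI-compatible chat endpoint; when the
// prefix/suffix fields are absent or empty, the server falls back to its
// --in-prefix/--in-suffix values.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    const json body = {
        {"messages", json::array({
            json{{"role", "system"}, {"content", "You are a helpful assistant"}},
            json{{"role", "user"},   {"content", "How are you?"}},
        })},
        {"input_prefix", "[INST] "},     // per-request override (optional)
        {"input_suffix", " [/INST]\n"},  // per-request override (optional)
        {"stream", false},
    };
    std::cout << body.dump(2) << std::endl;  // POST this to /v1/chat/completions
    return 0;
}
```

When neither field is present in the body, the server-level `--in-prefix`/`--in-suffix` values (now accepted by the server example via the common/arg.cpp change above) are used instead.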