From b5dbcf6f5ebdb745032797e9ae8e03264845e4f7 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:10:46 -0500 Subject: [PATCH] Smoothing factor backport --- common/sampling.cpp | 5 +++-- common/sampling.h | 1 + examples/server/server.cpp | 1 + llama.cpp | 17 +++++++++++++++-- llama.h | 5 +++-- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 45d68b26c2b93..c4c63678f3bd2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -132,6 +132,7 @@ static void sampler_queue( const float temp = params.temp; const float dynatemp_range = params.dynatemp_range; const float dynatemp_exponent = params.dynatemp_exponent; + const float smoothing_factor = params.smoothing_factor; const int32_t top_k = params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; @@ -147,10 +148,10 @@ static void sampler_queue( case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; case llama_sampler_type::TEMPERATURE: - if (dynatemp_range > 0) { + if (dynatemp_range > 0 || smoothing_factor > 0) { float dynatemp_min = std::max(0.0f, temp - dynatemp_range); float dynatemp_max = std::max(0.0f, temp + dynatemp_range); - llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor); } else { llama_sample_temp(ctx_main, &cur_p, temp); } diff --git a/common/sampling.h b/common/sampling.h index 56ed991b8478a..e34cc3125c9f3 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -31,6 +31,7 @@ typedef struct llama_sampling_params { float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_exponent = 1.00f; // 
controls how entropy maps to temperature in dynamic temperature sampler + float smoothing_factor = 0.0f; // 0.0 = disabled; controls the quadratic adjustment in smooth sampling int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 92090b92028aa..d79065ca1918a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -839,6 +839,7 @@ struct server_context { slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); + slot.sparams.smoothing_factor = json_value(data, "smoothing_factor", default_sparams.smoothing_factor); slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); diff --git a/llama.cpp b/llama.cpp index 21e7a067af65f..e4091bd161a5d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12183,14 +12183,27 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { +void llama_sample_entropy(struct llama_context* ctx, llama_token_data_array* candidates_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) { const int64_t t_start_sample_us = ggml_time_us(); // no need to do anything if there is only one (or zero) candidates - if(candidates_p->size <= 1) { + if (candidates_p->size <= 1) { return; } + // Apply
smoothing if smoothing_factor is > 0; otherwise the base implementation is left unchanged. + if (smoothing_factor > 0 && candidates_p->size > 1) { + llama_sample_softmax(ctx, candidates_p); + float h = candidates_p->data[0].logit; // Taken as the maximum logit (assumes llama_sample_softmax leaves candidates sorted in descending order - TODO confirm); added back after the transformation + + // Apply quadratic transformation using the smoothing_factor + for (size_t i = 0; i < candidates_p->size; ++i) { + float logit_shifted = candidates_p->data[i].logit - h; + candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h; + } + llama_sample_softmax(ctx, candidates_p); + } + // Calculate maximum possible entropy float max_entropy = -logf(1.0f / candidates_p->size); diff --git a/llama.h b/llama.h index f061d014ca8eb..03ccd23d71d00 100644 --- a/llama.h +++ b/llama.h @@ -864,13 +864,14 @@ extern "C" { float p, size_t min_keep); - /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. + /// @details Dynamic temperature and smooth sampling implementations wrapped into one function; no research papers are available. LLAMA_API void llama_sample_entropy( struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, - float exponent_val); + float exponent_val, + float smoothing_factor); LLAMA_API void llama_sample_temp( struct llama_context * ctx,