From ea15108462c88dd76042960ffd28799ffd395925 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Tue, 16 Jan 2024 20:58:41 +0900 Subject: [PATCH] implemented dynamic temperature sampling from koboldcpp --- common/sampling.cpp | 21 +++++++++++++- common/sampling.h | 1 + llama.cpp | 68 +++++++++++++++++++++++++++++++++++++++++++++ llama.h | 8 ++++++ 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index dd1ffeb1b8083..6bc2407a329cf 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -129,6 +129,7 @@ static void sampler_queue( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; + const float dynatemp_range = params.dynatemp_range; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; @@ -143,7 +144,25 @@ static void sampler_queue( case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + + case 't': + if (dynatemp_range>0) + { + float dynatemp_min = temp - dynatemp_range; + float dynatemp_max = temp + dynatemp_range; + //do not allow negative values + dynatemp_min = dynatemp_min<0?0:dynatemp_min; + dynatemp_max = dynatemp_max<0?0:dynatemp_max; + + llama_sample_entropy(ctx_main, &cur_p, temp, dynatemp_min, dynatemp_max); + } + else + { + llama_sample_temp(ctx_main, &cur_p, temp); + } + + break; + default : break; } } diff --git a/common/sampling.h b/common/sampling.h index f16ef97e34a10..d95e36433cc4a 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -18,6 +18,7 @@ typedef struct llama_sampling_params { float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled + float dynatemp_range = 0.00f; // 0.0 = disabled int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/llama.cpp b/llama.cpp index 46c4d11c88873..79ec664c6e562 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7779,6 +7779,74 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } +void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float min_temp = 0, float max_temp = 2.0f) { + const int64_t t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(ctx, candidates_p); + + float exponent_val = 1.0f; + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < candidates_p->size; ++i) { + float prob = candidates_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / candidates_p->size); + + // Guard against division by zero + if (max_entropy == 0.0f) { + max_entropy = 1.0f; // This ensures that normalized_entropy will be 0 when entropy is 0 + } + + // Normalize the entropy + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + + // //todo: Ensure to hide print statements unless debugging! 
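+    // Worked example of the mapping above (illustrative numbers, not produced by
+    // this code): with the signature defaults min_temp = 0, max_temp = 2.0f and
+    // exponent_val = 1.0f, powf() reduces to a linear interpolation, so a sharply
+    // peaked distribution with normalized_entropy ~= 0.3 yields dyn_temp ~= 0.6,
+    // while a near-uniform distribution with normalized_entropy ~= 1.0 yields
+    // dyn_temp ~= max_temp = 2.0. Confident predictions are therefore sampled at
+    // a lower temperature than uncertain ones. In the sampler_queue hunk above,
+    // min_temp and max_temp arrive as temp -/+ dynatemp_range, clamped to be
+    // non-negative.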
+ // printf("Your text maxtemp value is: %f\n", max_temp); + // // Print the variables + // printf("Entropy: %f\n", entropy); + // printf("Max Possible Entropy: %f\n", max_entropy); + // printf("Normalized Entropy: %f\n", normalized_entropy); + // printf("Exponent: %f\n", exponent_val); + // printf("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + double max_l_double = candidates_p->data[0].logit; + double cum_sum_double = 0.0; + for (size_t i = 0; i < candidates_p->size; ++i) { + double p = exp(candidates_p->data[i].logit - max_l_double); + candidates_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + + // //todo: Ensure to hide print statements unless debugging! + // // Print the updated top 25 probabilities after temperature scaling + // printf("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + // for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) { + // printf("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f); + // } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); diff --git a/llama.h b/llama.h index a570b0d6968fb..37b48fa6cd266 100644 --- a/llama.h +++ b/llama.h @@ -770,6 +770,14 @@ extern "C" { float p, size_t min_keep); + /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. + LLAMA_API void llama_sample_entropy( + struct llama_context * ctx, + llama_token_data_array * candidates_p, + float temp, + float min_temp, + float max_temp); + LLAMA_API void llama_sample_temp( struct llama_context * ctx, llama_token_data_array * candidates,