Skip to content

Implementation of dynamic temperature sampling as seen in KoboldCpp #4972

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ static void sampler_queue(
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
Expand All @@ -143,7 +145,15 @@ static void sampler_queue(
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
case 't':
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
} else {
llama_sample_temp(ctx_main, &cur_p, temp);
}
break;
default : break;
}
}
Expand Down
2 changes: 2 additions & 0 deletions common/sampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
Expand Down
67 changes: 67 additions & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7783,6 +7783,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
}
}

void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
const int64_t t_start_sample_us = ggml_time_us();

// no need to do anything if there is only one (or zero) candidates
if(candidates_p->size <= 1) {
return;
}

// Calculate maximum possible entropy
float max_entropy = -logf(1.0f / candidates_p->size);

llama_sample_softmax(nullptr, candidates_p);

// Calculate entropy of the softmax probabilities
float entropy = 0.0f;
for (size_t i = 0; i < candidates_p->size; ++i) {
float prob = candidates_p->data[i].p;
if (prob > 0.0f) { // Ensure no log(0)
entropy -= prob * logf(prob);
}
}

// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
float normalized_entropy = entropy / max_entropy;

// Map the normalized entropy to the desired temperature range using the power function
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);

#ifdef DEBUG
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
#endif

// Apply the dynamically calculated temperature scaling
for (size_t i = 0; i < candidates_p->size; ++i) {
candidates_p->data[i].logit /= dyn_temp;
}

// Re-compute softmax probabilities after scaling logits with dynamic temperature
double max_l_double = candidates_p->data[0].logit;
double cum_sum_double = 0.0;
for (size_t i = 0; i < candidates_p->size; ++i) {
double p = exp(candidates_p->data[i].logit - max_l_double);
candidates_p->data[i].p = p; // Store the scaled probability
cum_sum_double += p;
}
for (size_t i = 0; i < candidates_p->size; ++i) {
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
}

#ifdef DEBUG
// Print the updated top 25 probabilities after temperature scaling
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
}
#endif

if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}

void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
const int64_t t_start_sample_us = ggml_time_us();

Expand Down
8 changes: 8 additions & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,14 @@ extern "C" {
float p,
size_t min_keep);

/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API void llama_sample_entropy(
struct llama_context * ctx,
llama_token_data_array * candidates_p,
float min_temp,
float max_temp,
float exponent_val);

LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,
Expand Down