Commit 4483396

llama : apply classifier-free guidance to logits directly (#4951)
1 parent d9aa4ff commit 4483396
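
This commit adds llama_sample_apply_guidance(), which applies classifier-free guidance directly to a raw logit buffer, rewires common/sampling.cpp to call it before the candidate array is built, and marks the old candidate-array entry point llama_sample_classifier_free_guidance() as deprecated.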

File tree

3 files changed: +55, −27 lines

  common/sampling.cpp
  llama.cpp
  llama.h

common/sampling.cpp

Lines changed: 5 additions & 4 deletions
@@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl(
         logits[it->first] += it->second;
     }
 
+    if (ctx_cfg) {
+        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+    }
+
     cur.clear();
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl(
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
-    if (ctx_cfg) {
-        llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
-    }
-
     // apply penalties
     const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
     const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
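
Note the ordering: guidance now runs on the raw logits of the main context before `cur` is populated, so every downstream sampler (penalties, top-k, temperature, ...) sees already-guided values, and the old per-candidate guidance call later in the function is removed.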

llama.cpp

Lines changed: 38 additions & 18 deletions
@@ -7898,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
+void llama_sample_apply_guidance(
+          struct llama_context * ctx,
+                        float * logits,
+                        float * logits_guidance,
+                          float   scale) {
+    GGML_ASSERT(ctx);
+
+    const auto t_start_sample_us = ggml_time_us();
+    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    llama_log_softmax(logits, n_vocab);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+              auto & l = logits[i];
+        const auto & g = logits_guidance[i];
+
+        l = scale * (l - g) + g;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 void llama_sample_classifier_free_guidance(
           struct llama_context * ctx,
         llama_token_data_array * candidates,
           struct llama_context * guidance_ctx,
                          float   scale) {
-    int64_t t_start_sample_us = ggml_time_us();
-
     GGML_ASSERT(ctx);
+    int64_t t_start_sample_us;
 
-    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+    t_start_sample_us = ggml_time_us();
+    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
 
-    GGML_ASSERT(n_vocab == (int)candidates->size);
+    GGML_ASSERT(n_vocab == candidates->size);
     GGML_ASSERT(!candidates->sorted);
 
-    std::vector<float> logits_base;
-    logits_base.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        logits_base.push_back(candidates->data[i].logit);
+    std::vector<float> logits_base(n_vocab);
+    for (size_t i = 0; i < n_vocab; ++i) {
+        logits_base[i] = candidates->data[i].logit;
     }
-    llama_log_softmax(logits_base.data(), candidates->size);
 
-    float* logits_guidance = llama_get_logits(guidance_ctx);
-    llama_log_softmax(logits_guidance, n_vocab);
+    float * logits_guidance = llama_get_logits(guidance_ctx);
 
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_guidance = logits_guidance[i];
-        float logit_base = logits_base[i];
-        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
+    t_start_sample_us = ggml_time_us();
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    for (size_t i = 0; i < n_vocab; ++i) {
+        candidates->data[i].logit = logits_base[i];
     }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
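
The core math is unchanged from the old path: both logit vectors are log-softmax-normalized, then blended as l = scale * (l - g) + g. Below is a self-contained sketch of that computation on a made-up 4-token vocabulary; the logit values and scale are illustrative, not taken from the library.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// In-place log-softmax, mirroring what llama_log_softmax computes.
static void log_softmax(std::vector<float> & v) {
    const float max_l = *std::max_element(v.begin(), v.end());
    float sum = 0.0f;
    for (float x : v) sum += std::exp(x - max_l);
    const float log_sum = std::log(sum) + max_l;
    for (float & x : v) x -= log_sum;
}

int main() {
    // toy vocabulary of 4 tokens: base logits and negative-prompt (guidance) logits
    std::vector<float> logits          = { 1.0f, 2.0f, 0.5f, -1.0f };
    std::vector<float> logits_guidance = { 2.5f, 0.0f, 0.5f, -1.0f };
    const float scale = 1.5f; // 1.0f would be a no-op; higher values mean stronger guidance

    log_softmax(logits);
    log_softmax(logits_guidance);

    // same blend as llama_sample_apply_guidance: l = scale * (l - g) + g
    for (size_t i = 0; i < logits.size(); ++i) {
        logits[i] = scale * (logits[i] - logits_guidance[i]) + logits_guidance[i];
    }

    for (size_t i = 0; i < logits.size(); ++i) {
        std::printf("token %zu: guided logit = %.3f\n", i, logits[i]);
    }
    return 0;
}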

llama.h

Lines changed: 12 additions & 5 deletions
@@ -714,14 +714,21 @@ extern "C" {
             float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+            struct llama_context * ctx,
+                          float * logits,
+                          float * logits_guidance,
+                            float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
              struct llama_context * ctx,
            llama_token_data_array * candidates,
              struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
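
With the old entry point now marked DEPRECATED, callers migrate roughly as follows. This is a hypothetical sketch; `ctx_main`, `ctx_cfg`, `idx`, and `cfg_scale` stand in for the caller's own state, mirroring the common/sampling.cpp change above.

// before: guidance applied to a llama_token_data_array (deprecated)
//   llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, cfg_scale);

// after: guidance applied directly to the raw logit buffers
float * logits          = llama_get_logits_ith(ctx_main, idx);
float * logits_guidance = llama_get_logits_ith(ctx_cfg,  idx);

llama_sample_apply_guidance(ctx_main, logits, logits_guidance, cfg_scale);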
