
Commit 8aec522

server : token healing for infilling/FIM

1 parent f8ebe38 commit 8aec522


examples/server/server.cpp

Lines changed: 23 additions & 15 deletions
@@ -2064,6 +2064,8 @@ struct server_context {
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_generation = 0;
 
+                llama_token_healing_output token_healing_out{};
+
                 if (slot.infill) {
                     const bool add_bos = llama_should_add_bos_token(model);
                     bool suff_rm_leading_spc = true;
@@ -2083,6 +2085,12 @@ struct server_context {
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
                     suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
 
+                    if (slot.sparams.token_healing_enabled) {
+                        // For FIM roll back only the prefix part (i.e. cursor location)
+                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                         prefix_tokens, slot.sparams.token_healing_n_rollback);
+                    }
+
                     auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                     auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
                     if (add_bos) {
@@ -2098,6 +2106,11 @@ struct server_context {
                     prompt_tokens = embd_inp;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+
+                    if (slot.sparams.token_healing_enabled) {
+                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                         prompt_tokens, slot.sparams.token_healing_n_rollback);
+                    }
                 }
 
                 slot.n_past = 0;
@@ -2112,6 +2125,16 @@ struct server_context {
                     {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                 });
 
+                if (slot.sparams.token_healing_enabled) {
+                    slot.n_th_prefix = token_healing_out.prefix.size();
+                    LOG_VERBOSE("token healing prompt", {
+                        {"id_slot", slot.id},
+                        {"id_task", slot.id_task},
+                        {"removed_suffix", token_healing_out.prefix},
+                        {"n_tokens_removed", token_healing_out.n_tokens_removed}
+                    });
+                }
+
                 // empty prompt passed -> release the slot and send empty response
                 if (prompt_tokens.empty()) {
                     LOG_INFO("empty prompt - releasing slot", {
@@ -2127,21 +2150,6 @@ struct server_context {
                     continue;
                 }
 
-                // Roll back prompt tokens if token healing
-                llama_token_healing_output token_healing_out{};
-                if (slot.sparams.token_healing_enabled) {
-                    token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
-                                                                     prompt_tokens, slot.sparams.token_healing_n_rollback);
-                    slot.n_th_prefix = token_healing_out.prefix.size();
-                    slot.n_prompt_tokens = prompt_tokens.size();
-                    LOG_VERBOSE("token healing prompt", {
-                        {"id_slot", slot.id},
-                        {"id_task", slot.id_task},
-                        {"removed_suffix", token_healing_out.prefix},
-                        {"n_tokens_removed", token_healing_out.n_tokens_removed}
-                    });
-                }
-
                 if (slot.embedding) {
                     // this prompt is too large to process - discard it
                     if (slot.n_prompt_tokens > n_ubatch) {
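
For readers unfamiliar with the feature: token healing removes the last token(s) of the prompt when the cursor may have split a word across a token boundary, then constrains generation to re-produce the removed text so the model can emit it fused into a single, better-scoring token. This commit hoists the rollback ahead of the infill/non-infill branch so that for FIM only the prefix part (the cursor location) is healed. The following is a minimal, self-contained sketch of the rollback idea only; the helper names, the toy vocabulary, and representing tokens by their text are all assumptions for illustration, not the llama.cpp implementation:

    // token_healing_sketch.cpp -- illustrative only; not the llama.cpp API.
    #include <iostream>
    #include <string>
    #include <vector>

    // Toy vocabulary (assumption): the real server queries the model's vocab.
    static const std::vector<std::string> k_vocab = {"pri", "print", "println", "(", "ln"};

    // True if at least one vocab token begins with `prefix`.
    static bool vocab_has_prefix(const std::string & prefix) {
        for (const auto & piece : k_vocab) {
            if (piece.compare(0, prefix.size(), prefix) == 0) {
                return true;
            }
        }
        return false;
    }

    struct healing_output {
        std::string prefix;       // text removed from the prompt tail
        int n_tokens_removed = 0; // how many tokens were rolled back
    };

    // Roll back up to `max_rollback` tokens from the prompt tail, as long as the
    // accumulated text can still be covered by a single vocab token. Generation
    // is then constrained to re-produce `prefix`, fused with the continuation.
    static healing_output heal_rollback(std::vector<std::string> & prompt_tokens, int max_rollback) {
        healing_output out;
        while (out.n_tokens_removed < max_rollback && !prompt_tokens.empty()) {
            const std::string candidate = prompt_tokens.back() + out.prefix;
            if (!vocab_has_prefix(candidate)) {
                break; // no token starts with this text -> stop rolling back
            }
            out.prefix = candidate;
            prompt_tokens.pop_back();
            out.n_tokens_removed++;
        }
        return out;
    }

    int main() {
        // A FIM prefix that ends mid-word at the cursor: "std::pri|"
        std::vector<std::string> prefix_tokens = {"std", "::", "pri"};
        const healing_output out = heal_rollback(prefix_tokens, /*max_rollback=*/1);
        // Removed "pri"; the sampler can now prefer tokens like "print"/"println".
        std::cout << "removed_suffix=" << out.prefix
                  << " n_tokens_removed=" << out.n_tokens_removed << "\n";
    }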

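The rollback result is recorded as slot.n_th_prefix = token_healing_out.prefix.size(), i.e. the byte length of the removed text. Presumably the generation path then uses the healed prefix in two ways: restricting the first sampled token(s) to candidates consistent with the removed text, and dropping those already-known bytes from the text streamed to the client. A sketch of that consumer side, under the same toy token-as-text assumption (hypothetical helpers, not the actual server code):

    #include <string>
    #include <vector>

    // Keep only candidates consistent with the healed prefix: the token either
    // re-covers the whole remaining prefix (and may add more text), or it is
    // itself a prefix of it (further constrained tokens will follow).
    static std::vector<std::string> filter_candidates(const std::vector<std::string> & candidates,
                                                      const std::string & healed) {
        std::vector<std::string> out;
        for (const auto & piece : candidates) {
            const bool covers     = piece.compare(0, healed.size(), healed) == 0; // piece starts with healed
            const bool is_partial = healed.compare(0, piece.size(), piece) == 0;  // healed starts with piece
            if (covers || is_partial) {
                out.push_back(piece);
            }
        }
        return out;
    }

    // The first n_th_prefix bytes of the generated text duplicate prompt text
    // the client already has, so they are not re-sent.
    static std::string trim_healed_prefix(const std::string & text, size_t n_th_prefix) {
        return text.size() > n_th_prefix ? text.substr(n_th_prefix) : std::string();
    }

    int main() {
        const std::string healed = "pri";
        const auto kept = filter_candidates({"print", "println", "p", "foo"}, healed);
        // kept == {"print", "println", "p"}; "foo" is inconsistent with "pri".
        const std::string emitted = trim_healed_prefix("println(", healed.size());
        // emitted == "ntln(" -- the client's "pri" plus this yields "println(".
        (void) kept; (void) emitted;
    }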