@@ -2064,6 +2064,8 @@ struct server_context {
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_generation = 0;
 
+                llama_token_healing_output token_healing_out{};
+
                 if (slot.infill) {
                     const bool add_bos = llama_should_add_bos_token(model);
                     bool suff_rm_leading_spc = true;
@@ -2083,6 +2085,12 @@ struct server_context {
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
                     suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
 
+                    if (slot.sparams.token_healing_enabled) {
+                        // For FIM roll back only the prefix part (i.e. cursor location)
+                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                         prefix_tokens, slot.sparams.token_healing_n_rollback);
+                    }
+
                     auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                     auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
                     if (add_bos) {
@@ -2098,6 +2106,11 @@ struct server_context {
                     prompt_tokens = embd_inp;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+
+                    if (slot.sparams.token_healing_enabled) {
+                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                         prompt_tokens, slot.sparams.token_healing_n_rollback);
+                    }
                 }
 
                 slot.n_past = 0;
@@ -2112,6 +2125,16 @@ struct server_context {
                     {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                 });
 
+                if (slot.sparams.token_healing_enabled) {
+                    slot.n_th_prefix = token_healing_out.prefix.size();
+                    LOG_VERBOSE("token healing prompt", {
+                        {"id_slot",          slot.id},
+                        {"id_task",          slot.id_task},
+                        {"removed_suffix",   token_healing_out.prefix},
+                        {"n_tokens_removed", token_healing_out.n_tokens_removed}
+                    });
+                }
+
                 // empty prompt passed -> release the slot and send empty response
                 if (prompt_tokens.empty()) {
                     LOG_INFO("empty prompt - releasing slot", {
@@ -2127,21 +2150,6 @@ struct server_context {
                     continue;
                 }
 
-                // Roll back prompt tokens if token healing
-                llama_token_healing_output token_healing_out{};
-                if (slot.sparams.token_healing_enabled) {
-                    token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
-                                                                     prompt_tokens, slot.sparams.token_healing_n_rollback);
-                    slot.n_th_prefix = token_healing_out.prefix.size();
-                    slot.n_prompt_tokens = prompt_tokens.size();
-                    LOG_VERBOSE("token healing prompt", {
-                        {"id_slot",          slot.id},
-                        {"id_task",          slot.id_task},
-                        {"removed_suffix",   token_healing_out.prefix},
-                        {"n_tokens_removed", token_healing_out.n_tokens_removed}
-                    });
-                }
-
                 if (slot.embedding) {
                     // this prompt is too large to process - discard it
                     if (slot.n_prompt_tokens > n_ubatch) {
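
For context on what the rollback step in this diff does, here is a minimal, self-contained sketch of the token-healing idea. It uses a toy vocabulary and placeholder types (`healing_output`, `rollback`) rather than the llama.cpp API, so it is illustrative only; the actual `llama_token_healing_rollback` helper additionally takes the healing type and rollback limit from `slot.sparams`, as shown in the hunks above.

```cpp
// Sketch of token-healing rollback with a toy vocabulary (not llama.cpp code).
#include <cstdio>
#include <string>
#include <vector>

struct healing_output {
    std::vector<int> tokens;    // prompt tokens after rollback
    std::string      prefix;    // text of the removed tokens, to be re-generated
    int              n_removed = 0;
};

// Roll back up to n_rollback tokens from the end of the prompt, recording their text.
static healing_output rollback(std::vector<int> tokens,
                               const std::vector<std::string> & vocab,
                               int n_rollback) {
    healing_output out;
    while (n_rollback-- > 0 && !tokens.empty()) {
        out.prefix = vocab[tokens.back()] + out.prefix; // prepend the removed piece
        tokens.pop_back();
        out.n_removed++;
    }
    out.tokens = std::move(tokens);
    return out;
}

int main() {
    // Toy vocabulary: token id -> text piece
    const std::vector<std::string> vocab = {"The", " quick", " bro", " brown"};
    const std::vector<int> prompt = {0, 1, 2}; // "The quick bro" (ends mid-word)

    const healing_output healed = rollback(prompt, vocab, /*n_rollback=*/1);
    // A token-healing sampler would now only accept tokens whose text starts with
    // healed.prefix (" bro"), so the model can emit " brown" as a single natural
    // token instead of being forced to continue after the awkward "bro" split.
    std::printf("removed %d token(s), healing prefix: '%s'\n",
                healed.n_removed, healed.prefix.c_str());
    return 0;
}
```

In the diff itself, the length of the removed text is stored in `slot.n_th_prefix`; presumably this lets the server later strip the re-generated healing prefix from the streamed output, but that handling is outside these hunks.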