31
31
#include " json-schema-to-grammar.mjs.hpp"
32
32
33
33
#include < atomic>
34
- #include < chrono>
35
34
#include < condition_variable>
36
35
#include < cstddef>
37
36
#include < cinttypes>
43
42
#include < unordered_map>
44
43
#include < unordered_set>
45
44
46
// Slot-scoped logging helpers.
//
// Each macro forwards to the matching LOG_* macro (assumed printf-style; defined elsewhere)
// and prefixes the message with the calling function's name, printed in a 12-character
// field (`%12.*s` with precision 12), followed by the slot's `id` and `id_task`.
//
//   slot - any expression with `id` and `id_task` members; it is parenthesized in the
//          expansion so `*this` and `*ret` work as arguments
//   fmt  - string literal; it is concatenated onto the prefix literal
//   ...  - at least one argument is required because plain __VA_ARGS__ is used;
//          fixed messages are therefore passed as ("%s", "message")
//
// NOTE: there must be no whitespace between the macro name and '(' - otherwise the
// preprocessor treats the definition as an object-like macro.
#define SLT_INF(slot, fmt, ...) LOG_INF(" slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN(" slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR(" slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG(" slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)

// Server-scoped logging helpers: same function-name prefix, without slot/task ids.
#define SRV_INF(fmt, ...) LOG_INF(" srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN(" srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
@@ -210,7 +209,7 @@ struct server_slot {
210
209
std::function<void (int )> callback_on_release;
211
210
212
211
void reset () {
213
- SLT_DBG (" %s" , " \n " );
212
+ SLT_DBG (* this , " %s" , " \n " );
214
213
215
214
n_prompt_tokens = 0 ;
216
215
generated_text = " " ;
@@ -251,15 +250,15 @@ struct server_slot {
251
250
252
251
void add_token (const completion_token_output & token) {
253
252
if (!is_processing ()) {
254
- SLT_WRN (" %s" , " slot is not processing\n " );
253
+ SLT_WRN (* this , " %s" , " slot is not processing\n " );
255
254
return ;
256
255
}
257
256
generated_token_probs.push_back (token);
258
257
}
259
258
260
259
void release () {
261
260
if (is_processing ()) {
262
- SLT_INF (" stop processing: n_past = %d, truncated = %d\n " , n_past, truncated);
261
+ SLT_INF (* this , " stop processing: n_past = %d, truncated = %d\n " , n_past, truncated);
263
262
264
263
t_token_generation = (ggml_time_us () - t_start_generation) / 1e3 ;
265
264
state = SLOT_STATE_IDLE;
@@ -316,7 +315,8 @@ struct server_slot {
316
315
const double t_gen = t_token_generation / n_decoded;
317
316
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
318
317
319
- SLT_INF (" \n "
318
+ SLT_INF (*this ,
319
+ " \n "
320
320
" \r prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n "
321
321
" \r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n "
322
322
" \r total time = %10.2f ms / %5d tokens\n " ,
@@ -694,7 +694,7 @@ struct server_context {
694
694
slot.n_ctx = n_ctx_slot;
695
695
slot.n_predict = params.n_predict ;
696
696
697
- SRV_INF ( " new slot, id_slot = %d, n_ctx_slot = %d\n " , slot. id , slot.n_ctx );
697
+ SLT_INF (slot, " new slot n_ctx_slot = %d\n " , slot.n_ctx );
698
698
699
699
const int ga_n = params.grp_attn_n ;
700
700
const int ga_w = params.grp_attn_w ;
@@ -705,7 +705,7 @@ struct server_context {
705
705
// GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
706
706
// GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
707
707
708
- SRV_INF ( " slot self-extend: id_slot = %d, ga_n = %d, ga_w = %d\n " , slot. id , ga_n, ga_w);
708
+ SLT_INF (slot, " slot self-extend: ga_n = %d, ga_w = %d\n " , ga_n, ga_w);
709
709
}
710
710
711
711
slot.ga_i = 0 ;
@@ -828,7 +828,7 @@ struct server_context {
828
828
}
829
829
830
830
if (ret != nullptr ) {
831
- SRV_DBG ( " selected slot by lcp similarity, id_slot = %d, max_lcp_len = %d, similarity = %f\n " , ret-> id , max_lcp_len, similarity);
831
+ SLT_DBG (*ret, " selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n " , max_lcp_len, similarity);
832
832
}
833
833
}
834
834
@@ -849,7 +849,7 @@ struct server_context {
849
849
}
850
850
851
851
if (ret != nullptr ) {
852
- SRV_DBG ( " selected slot by lru, id_slot = %d, t_last = %" PRId64 " \n " , ret-> id , t_last);
852
+ SLT_DBG (*ret, " selected slot by lru, t_last = %" PRId64 " \n " , t_last);
853
853
}
854
854
}
855
855
@@ -914,13 +914,13 @@ struct server_context {
914
914
915
915
if (slot.params .cache_prompt && slot.ga_n != 1 ) {
916
916
slot.params .cache_prompt = false ;
917
- SRV_WRN ( " slot %d: group-attention is not supported with prompt caching. disabling cache" , slot. id );
917
+ SLT_WRN ( slot, " %s " , " group-attention is not supported with prompt caching. disabling cache\n " );
918
918
}
919
919
920
920
if (slot.n_predict > 0 && slot.params .n_predict > slot.n_predict ) {
921
921
// Might be better to reject the request with a 400 ?
922
922
slot.params .n_predict = slot.n_predict ;
923
- SRV_WRN ( " slot %d: n_predict exceeds server configuration, setting to %d" , slot.id , slot.n_predict );
923
+ SLT_WRN ( slot, " n_predict = %d exceeds server configuration, setting to %d" , slot.n_predict , slot.n_predict );
924
924
}
925
925
926
926
// infill
@@ -1029,7 +1029,7 @@ struct server_context {
1029
1029
slot.state = SLOT_STATE_PROCESSING_PROMPT;
1030
1030
slot.prompt_tokens .clear ();
1031
1031
1032
- SRV_INF ( " processing task, slot id = %d, task id = %d \n " , slot. id , slot. id_task );
1032
+ SLT_INF (slot, " %s " , " processing task \n " );
1033
1033
1034
1034
return true ;
1035
1035
}
@@ -1164,54 +1164,30 @@ struct server_context {
1164
1164
slot.stopped_limit = true ;
1165
1165
slot.has_next_token = false ;
1166
1166
1167
- SRV_DBG ( " slot %d, task %d: stopped by limit, n_decoded = %d, n_predict = %d\n " , slot. id , slot. id_task , slot.n_decoded , slot.params .n_predict );
1167
+ SLT_DBG ( slot, " stopped by limit, n_decoded = %d, n_predict = %d\n " , slot.n_decoded , slot.params .n_predict );
1168
1168
}
1169
1169
1170
1170
if (llama_token_is_eog (model, result.tok )) {
1171
1171
slot.stopped_eos = true ;
1172
1172
slot.has_next_token = false ;
1173
1173
1174
- SRV_DBG ( " slot %d, task %d: stopped by EOS\n " , slot. id , slot. id_task );
1174
+ SLT_DBG ( slot, " %s " , " stopped by EOS\n " );
1175
1175
}
1176
1176
1177
1177
const auto n_ctx_train = llama_n_ctx_train (model);
1178
1178
1179
1179
if (slot.params .n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
1180
- // LOG_WARNING("n_predict is not set and self-context extend is disabled."
1181
- // " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
1182
- // { "id_slot", slot.id },
1183
- // { "params.n_predict", slot.params.n_predict },
1184
- // { "slot.n_prompt_tokens", slot.n_prompt_tokens },
1185
- // { "slot.n_decoded", slot.n_decoded },
1186
- // { "slot.n_predict", slot.n_predict },
1187
- // { "n_slots", params.n_parallel },
1188
- // { "slot.n_ctx", slot.n_ctx },
1189
- // { "n_ctx", n_ctx },
1190
- // { "n_ctx_train", n_ctx_train },
1191
- // { "ga_n", slot.ga_n },
1192
- // });
1193
1180
slot.truncated = true ;
1194
1181
slot.stopped_limit = true ;
1195
1182
slot.has_next_token = false ; // stop prediction
1196
1183
1197
- SRV_WRN (" slot %d, task %d: n_predict is not set and self-context extend is disabled."
1198
- " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop\n " , slot.id , slot.id_task );
1199
- }
1200
-
1201
- // LOG_VERBOSE("next token", {
1202
- // {"id_slot", slot.id},
1203
- // {"id_task", slot.id_task},
1204
- // {"token", result.tok},
1205
- // {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
1206
- // {"has_next_token", slot.has_next_token},
1207
- // {"n_remain", slot.n_remaining},
1208
- // {"n_decoded", slot.n_decoded},
1209
- // {"stopped_eos", slot.stopped_eos},
1210
- // {"stopped_word", slot.stopped_word},
1211
- // {"stopped_limit", slot.stopped_limit},
1212
- // {"stopping_word", slot.stopping_word},
1213
- // });
1214
- SRV_DBG (" slot %d, task %d, n_decoded = %d, n_remaining = %d, next token: '%s'\n " , slot.id , slot.id_task , slot.n_decoded , slot.n_remaining , token_str.c_str ());
1184
+ SLT_WRN (slot,
1185
+ " n_predict (%d) is not set and self-context extend is disabled. "
1186
+ " Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n " ,
1187
+ slot.params .n_predict , n_ctx_train);
1188
+ }
1189
+
1190
+ SLT_DBG (slot, " n_decoded = %d, n_remaining = %d, next token: '%s'\n " , slot.n_decoded , slot.n_remaining , token_str.c_str ());
1215
1191
1216
1192
return slot.has_next_token ; // continue
1217
1193
}
@@ -1387,7 +1363,7 @@ struct server_context {
1387
1363
}
1388
1364
1389
1365
if (embd == NULL ) {
1390
- SRV_ERR ( " failed to get embeddings, token = %d, seq_id = %d\n " , batch.token [i], batch.seq_id [i][0 ]);
1366
+ SLT_ERR (slot, " failed to get embeddings, token = %d, seq_id = %d\n " , batch.token [i], batch.seq_id [i][0 ]);
1391
1367
1392
1368
res.data = json {
1393
1369
{" embedding" , std::vector<float >(n_embd, 0 .0f )},
@@ -1404,6 +1380,8 @@ struct server_context {
1404
1380
};
1405
1381
}
1406
1382
1383
+ SLT_DBG (slot, " %s" , " sending embeddings\n " );
1384
+
1407
1385
queue_results.send (res);
1408
1386
}
1409
1387
@@ -1841,7 +1819,7 @@ struct server_context {
1841
1819
const int n_left = (int ) system_tokens.size () + slot.n_past - n_keep;
1842
1820
const int n_discard = slot.params .n_discard ? slot.params .n_discard : (n_left / 2 );
1843
1821
1844
- SRV_WRN ( " slot context shift, id_slot = %d, id_task = %d, n_keep = %d, n_left = %d, n_discard = %d\n " , slot. id , slot. id_task , n_keep, n_left, n_discard);
1822
+ SLT_WRN (slot, " slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n " , n_keep, n_left, n_discard);
1845
1823
1846
1824
llama_kv_cache_seq_rm (ctx, slot.id + 1 , n_keep , n_keep + n_discard);
1847
1825
llama_kv_cache_seq_add (ctx, slot.id + 1 , n_keep + n_discard, system_tokens.size () + slot.n_past , -n_discard);
@@ -1884,7 +1862,8 @@ struct server_context {
1884
1862
slot.cache_tokens .push_back (slot.sampled );
1885
1863
}
1886
1864
1887
- SRV_DBG (" slot decode token, id_slot = %d, id_task = %d, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n " , slot.id , slot.id_task , slot.n_ctx , slot.n_past , (int ) system_tokens.size (), (int ) slot.cache_tokens .size (), slot.truncated );
1865
+ SLT_DBG (slot, " slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n " ,
1866
+ slot.n_ctx , slot.n_past , (int ) system_tokens.size (), (int ) slot.cache_tokens .size (), slot.truncated );
1888
1867
}
1889
1868
1890
1869
// process in chunks of params.n_batch
@@ -1905,7 +1884,7 @@ struct server_context {
1905
1884
1906
1885
// we haven't tokenized the prompt yet - do it now:
1907
1886
if (prompt_tokens.empty ()) {
1908
- SRV_INF ( " tokenizing prompt, id_slot = %d, id_task = %d \n " , slot. id , slot.id_task );
1887
+ SLT_INF (slot, " tokenizing prompt, len = %d\n " , ( int ) slot.prompt . size () );
1909
1888
1910
1889
slot.t_start_process_prompt = ggml_time_us ();
1911
1890
slot.t_start_generation = 0 ;
@@ -1949,11 +1928,11 @@ struct server_context {
1949
1928
slot.n_past = 0 ;
1950
1929
slot.n_prompt_tokens = prompt_tokens.size ();
1951
1930
1952
- SRV_INF ( " prompt tokenized, id_slot = %d, id_task = %d, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n " , slot. id , slot. id_task , slot.n_ctx , slot.params .n_keep , slot.n_prompt_tokens );
1931
+ SLT_INF (slot, " prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n " , slot.n_ctx , slot.params .n_keep , slot.n_prompt_tokens );
1953
1932
1954
1933
// empty prompt passed -> release the slot and send empty response
1955
1934
if (prompt_tokens.empty ()) {
1956
- SRV_WRN ( " empty prompt - releasing slot, id_slot = %d, id_task = %d \n " , slot. id , slot. id_task );
1935
+ SLT_WRN (slot, " %s " , " empty prompt - releasing slot\n " );
1957
1936
1958
1937
slot.release ();
1959
1938
slot.print_timings ();
@@ -1995,7 +1974,7 @@ struct server_context {
1995
1974
slot.truncated = true ;
1996
1975
slot.n_prompt_tokens = prompt_tokens.size ();
1997
1976
1998
- SRV_WRN ( " input truncated, id_slot = %d, id_task = %d, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n " , slot. id , slot. id_task , slot.n_ctx , slot.params .n_keep , n_left, slot.n_prompt_tokens );
1977
+ SLT_WRN (slot, " input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n " , slot.n_ctx , slot.params .n_keep , n_left, slot.n_prompt_tokens );
1999
1978
2000
1979
GGML_ASSERT (slot.n_prompt_tokens < slot.n_ctx );
2001
1980
}
@@ -2020,7 +1999,7 @@ struct server_context {
2020
1999
2021
2000
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0 ) {
2022
2001
// we have to evaluate at least 1 token to generate logits.
2023
- SRV_WRN ( " need to evaluate at least 1 token to generate logits, id_slot = %d, id_task = %d\n " , slot.id , slot.id_task );
2002
+ SLT_WRN (slot, " need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n " , slot.n_past , slot.n_prompt_tokens );
2024
2003
2025
2004
slot.n_past --;
2026
2005
if (slot.ga_i > 0 ) {
@@ -2069,7 +2048,7 @@ struct server_context {
2069
2048
// remove the non-common part from the cache
2070
2049
slot.cache_tokens .resize (slot.n_past );
2071
2050
2072
- SRV_INF ( " kv cache rm [%d, end), id_slot = %d, id_task = %d \n " , p0, slot. id , slot. id_task );
2051
+ SLT_INF (slot, " kv cache rm [%d, end)\n " , p0);
2073
2052
2074
2053
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past ;
2075
2054
@@ -2105,7 +2084,7 @@ struct server_context {
2105
2084
// {"n_tokens", batch.n_tokens},
2106
2085
// {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
2107
2086
// });
2108
- SRV_INF ( " prompt processing progress, id_slot = %d, n_past = %d, n_tokens = %d, progress = %f\n " , slot. id , slot.n_past , batch.n_tokens , (float ) slot.n_prompt_tokens_processed / slot.n_prompt_tokens );
2087
+ SLT_INF (slot, " prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n " , slot.n_past , batch.n_tokens , (float ) slot.n_prompt_tokens_processed / slot.n_prompt_tokens );
2109
2088
2110
2089
// entire prompt has been processed
2111
2090
if (slot.n_past == slot.n_prompt_tokens ) {
@@ -2125,7 +2104,7 @@ struct server_context {
2125
2104
// {"n_ctx", n_ctx},
2126
2105
// {"n_tokens", batch.n_tokens},
2127
2106
// });
2128
- SRV_INF ( " prompt done, id_slot = %d, n_past = %d, n_tokens = %d\n " , slot. id , slot.n_past , batch.n_tokens );
2107
+ SLT_INF (slot, " prompt done, n_past = %d, n_tokens = %d\n " , slot.n_past , batch.n_tokens );
2129
2108
}
2130
2109
}
2131
2110
@@ -2158,9 +2137,9 @@ struct server_context {
2158
2137
const int bd = (slot.ga_w / slot.ga_n ) * (slot.ga_n - 1 );
2159
2138
const int dd = (slot.ga_w / slot.ga_n ) - ib * bd - slot.ga_w ;
2160
2139
2161
- SRV_DBG ( " shift: [%6d, %6d] + %6d -> [%6d, %6d]\n " , slot.ga_i , slot.n_past_se , ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
2162
- SRV_DBG ( " div: [%6d, %6d] / %6d -> [%6d, %6d]\n " , slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w , slot.ga_n , (slot.ga_i + ib * bd) / slot.ga_n , (slot.ga_i + ib * bd + slot.ga_w ) / slot.ga_n );
2163
- SRV_DBG ( " shift: [%6d, %6d] + %6d -> [%6d, %6d]\n " , slot.ga_i + ib * bd + slot.ga_w , slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
2140
+ SLT_DBG (slot, " shift: [%6d, %6d] + %6d -> [%6d, %6d]\n " , slot.ga_i , slot.n_past_se , ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
2141
+ SLT_DBG (slot, " div: [%6d, %6d] / %6d -> [%6d, %6d]\n " , slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w , slot.ga_n , (slot.ga_i + ib * bd) / slot.ga_n , (slot.ga_i + ib * bd + slot.ga_w ) / slot.ga_n );
2142
+ SLT_DBG (slot, " shift: [%6d, %6d] + %6d -> [%6d, %6d]\n " , slot.ga_i + ib * bd + slot.ga_w , slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
2164
2143
2165
2144
llama_kv_cache_seq_add (ctx, slot.id + 1 , slot.ga_i , slot.n_past_se , ib * bd);
2166
2145
llama_kv_cache_seq_div (ctx, slot.id + 1 , slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w , slot.ga_n );
@@ -2170,7 +2149,7 @@ struct server_context {
2170
2149
2171
2150
slot.ga_i += slot.ga_w / slot.ga_n ;
2172
2151
2173
- SRV_DBG ( " \n n_past_old = %d, n_past = %d, ga_i = %d\n\n " , slot.n_past_se + bd, slot.n_past_se , slot.ga_i );
2152
+ SLT_DBG (slot, " \n n_past_old = %d, n_past = %d, ga_i = %d\n\n " , slot.n_past_se + bd, slot.n_past_se , slot.ga_i );
2174
2153
}
2175
2154
2176
2155
slot.n_past_se += n_tokens;
0 commit comments