Skip to content

Commit 0d0dc11

Browse files
committed
server : improve log format
1 parent 8f84210 commit 0d0dc11

File tree

1 file changed

+43
-64
lines changed

1 file changed

+43
-64
lines changed

examples/server/server.cpp

Lines changed: 43 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
#include "json-schema-to-grammar.mjs.hpp"
3232

3333
#include <atomic>
34-
#include <chrono>
3534
#include <condition_variable>
3635
#include <cstddef>
3736
#include <cinttypes>
@@ -43,10 +42,10 @@
4342
#include <unordered_map>
4443
#include <unordered_set>
4544

46-
#define SLT_INF(fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
47-
#define SLT_WRN(fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
48-
#define SLT_ERR(fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
49-
#define SLT_DBG(fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
45+
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
46+
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
47+
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
48+
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
5049

5150
#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
5251
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
@@ -210,7 +209,7 @@ struct server_slot {
210209
std::function<void(int)> callback_on_release;
211210

212211
void reset() {
213-
SLT_DBG("%s", "\n");
212+
SLT_DBG(*this, "%s", "\n");
214213

215214
n_prompt_tokens = 0;
216215
generated_text = "";
@@ -251,15 +250,15 @@ struct server_slot {
251250

252251
void add_token(const completion_token_output & token) {
253252
if (!is_processing()) {
254-
SLT_WRN("%s", "slot is not processing\n");
253+
SLT_WRN(*this, "%s", "slot is not processing\n");
255254
return;
256255
}
257256
generated_token_probs.push_back(token);
258257
}
259258

260259
void release() {
261260
if (is_processing()) {
262-
SLT_INF("stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
261+
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
263262

264263
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
265264
state = SLOT_STATE_IDLE;
@@ -316,7 +315,8 @@ struct server_slot {
316315
const double t_gen = t_token_generation / n_decoded;
317316
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
318317

319-
SLT_INF("\n"
318+
SLT_INF(*this,
319+
"\n"
320320
"\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
321321
"\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
322322
"\r total time = %10.2f ms / %5d tokens\n",
@@ -694,7 +694,7 @@ struct server_context {
694694
slot.n_ctx = n_ctx_slot;
695695
slot.n_predict = params.n_predict;
696696

697-
SRV_INF("new slot, id_slot = %d, n_ctx_slot = %d\n", slot.id, slot.n_ctx);
697+
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
698698

699699
const int ga_n = params.grp_attn_n;
700700
const int ga_w = params.grp_attn_w;
@@ -705,7 +705,7 @@ struct server_context {
705705
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
706706
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
707707

708-
SRV_INF("slot self-extend: id_slot = %d, ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
708+
SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
709709
}
710710

711711
slot.ga_i = 0;
@@ -828,7 +828,7 @@ struct server_context {
828828
}
829829

830830
if (ret != nullptr) {
831-
SRV_DBG("selected slot by lcp similarity, id_slot = %d, max_lcp_len = %d, similarity = %f\n", ret->id, max_lcp_len, similarity);
831+
SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
832832
}
833833
}
834834

@@ -849,7 +849,7 @@ struct server_context {
849849
}
850850

851851
if (ret != nullptr) {
852-
SRV_DBG("selected slot by lru, id_slot = %d, t_last = %" PRId64 "\n", ret->id, t_last);
852+
SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
853853
}
854854
}
855855

@@ -914,13 +914,13 @@ struct server_context {
914914

915915
if (slot.params.cache_prompt && slot.ga_n != 1) {
916916
slot.params.cache_prompt = false;
917-
SRV_WRN("slot %d: group-attention is not supported with prompt caching. disabling cache", slot.id);
917+
SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
918918
}
919919

920920
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
921921
// Might be better to reject the request with a 400 ?
922922
slot.params.n_predict = slot.n_predict;
923-
SRV_WRN("slot %d: n_predict exceeds server configuration, setting to %d", slot.id, slot.n_predict);
923+
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
924924
}
925925

926926
// infill
@@ -1029,7 +1029,7 @@ struct server_context {
10291029
slot.state = SLOT_STATE_PROCESSING_PROMPT;
10301030
slot.prompt_tokens.clear();
10311031

1032-
SRV_INF("processing task, slot id = %d, task id = %d\n", slot.id, slot.id_task);
1032+
SLT_INF(slot, "%s", "processing task\n");
10331033

10341034
return true;
10351035
}
@@ -1164,54 +1164,30 @@ struct server_context {
11641164
slot.stopped_limit = true;
11651165
slot.has_next_token = false;
11661166

1167-
SRV_DBG("slot %d, task %d: stopped by limit, n_decoded = %d, n_predict = %d\n", slot.id, slot.id_task, slot.n_decoded, slot.params.n_predict);
1167+
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
11681168
}
11691169

11701170
if (llama_token_is_eog(model, result.tok)) {
11711171
slot.stopped_eos = true;
11721172
slot.has_next_token = false;
11731173

1174-
SRV_DBG("slot %d, task %d: stopped by EOS\n", slot.id, slot.id_task);
1174+
SLT_DBG(slot, "%s", "stopped by EOS\n");
11751175
}
11761176

11771177
const auto n_ctx_train = llama_n_ctx_train(model);
11781178

11791179
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
1180-
//LOG_WARNING("n_predict is not set and self-context extend is disabled."
1181-
// " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
1182-
// { "id_slot", slot.id },
1183-
// { "params.n_predict", slot.params.n_predict },
1184-
// { "slot.n_prompt_tokens", slot.n_prompt_tokens },
1185-
// { "slot.n_decoded", slot.n_decoded },
1186-
// { "slot.n_predict", slot.n_predict },
1187-
// { "n_slots", params.n_parallel },
1188-
// { "slot.n_ctx", slot.n_ctx },
1189-
// { "n_ctx", n_ctx },
1190-
// { "n_ctx_train", n_ctx_train },
1191-
// { "ga_n", slot.ga_n },
1192-
// });
11931180
slot.truncated = true;
11941181
slot.stopped_limit = true;
11951182
slot.has_next_token = false; // stop prediction
11961183

1197-
SRV_WRN("slot %d, task %d: n_predict is not set and self-context extend is disabled."
1198-
" Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop\n", slot.id, slot.id_task);
1199-
}
1200-
1201-
//LOG_VERBOSE("next token", {
1202-
// {"id_slot", slot.id},
1203-
// {"id_task", slot.id_task},
1204-
// {"token", result.tok},
1205-
// {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
1206-
// {"has_next_token", slot.has_next_token},
1207-
// {"n_remain", slot.n_remaining},
1208-
// {"n_decoded", slot.n_decoded},
1209-
// {"stopped_eos", slot.stopped_eos},
1210-
// {"stopped_word", slot.stopped_word},
1211-
// {"stopped_limit", slot.stopped_limit},
1212-
// {"stopping_word", slot.stopping_word},
1213-
//});
1214-
SRV_DBG("slot %d, task %d, n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.id, slot.id_task, slot.n_decoded, slot.n_remaining, token_str.c_str());
1184+
SLT_WRN(slot,
1185+
"n_predict (%d) is not set and self-context extend is disabled. "
1186+
"Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
1187+
slot.params.n_predict, n_ctx_train);
1188+
}
1189+
1190+
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
12151191

12161192
return slot.has_next_token; // continue
12171193
}
@@ -1387,7 +1363,7 @@ struct server_context {
13871363
}
13881364

13891365
if (embd == NULL) {
1390-
SRV_ERR("failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
1366+
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
13911367

13921368
res.data = json {
13931369
{"embedding", std::vector<float>(n_embd, 0.0f)},
@@ -1404,6 +1380,8 @@ struct server_context {
14041380
};
14051381
}
14061382

1383+
SLT_DBG(slot, "%s", "sending embeddings\n");
1384+
14071385
queue_results.send(res);
14081386
}
14091387

@@ -1841,7 +1819,7 @@ struct server_context {
18411819
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
18421820
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
18431821

1844-
SRV_WRN("slot context shift, id_slot = %d, id_task = %d, n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.id_task, n_keep, n_left, n_discard);
1822+
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
18451823

18461824
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
18471825
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@@ -1884,7 +1862,8 @@ struct server_context {
18841862
slot.cache_tokens.push_back(slot.sampled);
18851863
}
18861864

1887-
SRV_DBG("slot decode token, id_slot = %d, id_task = %d, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", slot.id, slot.id_task, slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
1865+
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
1866+
slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
18881867
}
18891868

18901869
// process in chunks of params.n_batch
@@ -1905,7 +1884,7 @@ struct server_context {
19051884

19061885
// we haven't tokenized the prompt yet - do it now:
19071886
if (prompt_tokens.empty()) {
1908-
SRV_INF("tokenizing prompt, id_slot = %d, id_task = %d\n", slot.id, slot.id_task);
1887+
SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size());
19091888

19101889
slot.t_start_process_prompt = ggml_time_us();
19111890
slot.t_start_generation = 0;
@@ -1949,11 +1928,11 @@ struct server_context {
19491928
slot.n_past = 0;
19501929
slot.n_prompt_tokens = prompt_tokens.size();
19511930

1952-
SRV_INF("prompt tokenized, id_slot = %d, id_task = %d, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.id, slot.id_task, slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
1931+
SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
19531932

19541933
// empty prompt passed -> release the slot and send empty response
19551934
if (prompt_tokens.empty()) {
1956-
SRV_WRN("empty prompt - releasing slot, id_slot = %d, id_task = %d\n", slot.id, slot.id_task);
1935+
SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
19571936

19581937
slot.release();
19591938
slot.print_timings();
@@ -1995,7 +1974,7 @@ struct server_context {
19951974
slot.truncated = true;
19961975
slot.n_prompt_tokens = prompt_tokens.size();
19971976

1998-
SRV_WRN("input truncated, id_slot = %d, id_task = %d, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.id, slot.id_task, slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
1977+
SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
19991978

20001979
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
20011980
}
@@ -2020,7 +1999,7 @@ struct server_context {
20201999

20212000
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
20222001
// we have to evaluate at least 1 token to generate logits.
2023-
SRV_WRN("need to evaluate at least 1 token to generate logits, id_slot = %d, id_task = %d\n", slot.id, slot.id_task);
2002+
SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
20242003

20252004
slot.n_past--;
20262005
if (slot.ga_i > 0) {
@@ -2069,7 +2048,7 @@ struct server_context {
20692048
// remove the non-common part from the cache
20702049
slot.cache_tokens.resize(slot.n_past);
20712050

2072-
SRV_INF("kv cache rm [%d, end), id_slot = %d, id_task = %d\n", p0, slot.id, slot.id_task);
2051+
SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
20732052

20742053
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
20752054

@@ -2105,7 +2084,7 @@ struct server_context {
21052084
// {"n_tokens", batch.n_tokens},
21062085
// {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
21072086
//});
2108-
SRV_INF("prompt processing progress, id_slot = %d, n_past = %d, n_tokens = %d, progress = %f\n", slot.id, slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
2087+
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
21092088

21102089
// entire prompt has been processed
21112090
if (slot.n_past == slot.n_prompt_tokens) {
@@ -2125,7 +2104,7 @@ struct server_context {
21252104
// {"n_ctx", n_ctx},
21262105
// {"n_tokens", batch.n_tokens},
21272106
//});
2128-
SRV_INF("prompt done, id_slot = %d, n_past = %d, n_tokens = %d\n", slot.id, slot.n_past, batch.n_tokens);
2107+
SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
21292108
}
21302109
}
21312110

@@ -2158,9 +2137,9 @@ struct server_context {
21582137
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
21592138
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
21602139

2161-
SRV_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
2162-
SRV_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
2163-
SRV_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
2140+
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
2141+
SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
2142+
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
21642143

21652144
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
21662145
llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
@@ -2170,7 +2149,7 @@ struct server_context {
21702149

21712150
slot.ga_i += slot.ga_w / slot.ga_n;
21722151

2173-
SRV_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
2152+
SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
21742153
}
21752154

21762155
slot.n_past_se += n_tokens;

0 commit comments

Comments
 (0)