
Commit 778c070

server : logs + minor code style
1 parent 5d540e8 commit 778c070

File tree

1 file changed: +62 −50 lines changed

examples/server/server.cpp

Lines changed: 62 additions & 50 deletions
@@ -614,20 +614,24 @@ struct llama_server_context

         // create slots
         all_slots_are_idle = true;
-        if(max_ctx_per_slot == -1) {
+        if (max_ctx_per_slot == -1)
+        {
             max_ctx_per_slot = n_ctx / params.n_parallel; // split context
         }
-        if(max_ctx_per_slot * params.n_parallel > n_ctx) {
+
+        if (max_ctx_per_slot * params.n_parallel > n_ctx)
+        {
             printf("Error: The max context per slot is greater than the model context size");
             return;
         }
+
         LOG_TEE("Available slots:\n");
         for (int i = 0; i < params.n_parallel; i++)
         {
             llama_client_slot slot;
             slot.id = i;
             slot.max_context_size = max_ctx_per_slot;
             slot.reset();
+
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, max_ctx_per_slot);
             slots.push_back(slot);
         }
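
Note: with the split above, a model context of n_ctx = 4096 shared across params.n_parallel = 4 slots yields max_ctx_per_slot = 1024. A minimal standalone sketch of the same arithmetic and guard (all values hypothetical, not part of this commit):

    #include <cstdio>

    int main() {
        const int n_ctx      = 4096; // hypothetical model context size
        const int n_parallel = 4;    // hypothetical number of slots
        int max_ctx_per_slot = -1;   // -1 = split the context evenly, as above

        if (max_ctx_per_slot == -1) {
            max_ctx_per_slot = n_ctx / n_parallel; // 4096 / 4 = 1024
        }
        if (max_ctx_per_slot * n_parallel > n_ctx) {
            fprintf(stderr, "error: per-slot context exceeds the model context\n");
            return 1;
        }
        printf("max context per slot: %d\n", max_ctx_per_slot); // prints 1024
        return 0;
    }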
@@ -788,7 +792,7 @@ struct llama_server_context
             }
         }

-        if(multimodal)
+        if (multimodal)
         {
             const auto &images_data = data.find("image_data");
             if (images_data != data.end() && images_data->is_array())
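
The multimodal branch looks for an "image_data" array in the request JSON. A rough sketch of a request body that would take this branch, built with nlohmann::json (which server.cpp already uses); the exact schema is an assumption based on the images_data lookup above, not something this commit shows:

    #include "json.hpp"

    using json = nlohmann::json;

    int main() {
        json data = {
            {"prompt", "USER: [img-10] describe the image\nASSISTANT:"},
            {"image_data", json::array({
                {{"data", "<base64-encoded image bytes>"}, {"id", 10}}
            })}
        };

        // mirrors the lookup in the diff above
        const auto images_data = data.find("image_data");
        const bool has_images  = images_data != data.end() && images_data->is_array();
        return has_images ? 0 : 1;
    }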
@@ -1068,10 +1072,10 @@ struct llama_server_context
             slot.has_next_token = false;
         }

-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)){
-            slot.stopped_eos = true;
-            slot.has_next_token = false;
-            LOG_VERBOSE("eos token found", {});
+        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)) {
+            slot.stopped_eos = true;
+            slot.has_next_token = false;
+            LOG_VERBOSE("eos token found", {});
         }

         LOG_VERBOSE("next token", {
@@ -1277,22 +1281,25 @@ struct llama_server_context
     }

     task_result next_result(int task_id) {
-        while(true) {
+        while (true) {
             std::this_thread::sleep_for(std::chrono::microseconds(5));
             std::lock_guard<std::mutex> lock(mutex_results);
-            if(queue_results.empty()) {
+
+            if (queue_results.empty()) {
                 continue;
             }

-            for(int i = 0; i < queue_results.size(); i++) {
-                if(queue_results[i].id == task_id) {
+            for (int i = 0; i < (int) queue_results.size(); i++) {
+                if (queue_results[i].id == task_id) {
                     task_result res = queue_results[i];
                     queue_results.erase(queue_results.begin() + i);
                     return res;
                 }
             }
         }
-        return task_result{-1, false, false, {}};
+
+        // never reached
+        //return task_result{-1, false, false, {}};
     }

     // for multiple images processing
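
next_result busy-polls the shared results queue: it sleeps 5 microseconds, takes mutex_results, scans for a result matching task_id, and repeats; the lock is released at the end of every iteration, so producers can push results between scans. Since the function can only return from inside the loop, the old trailing return really was unreachable. A self-contained sketch of the same pattern (task_result and the queue are simplified stand-ins, not the server's real types):

    #include <chrono>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct task_result { int id; bool stop; bool error; };

    static std::mutex               mutex_results;
    static std::vector<task_result> queue_results;

    task_result next_result(int task_id) {
        while (true) {
            std::this_thread::sleep_for(std::chrono::microseconds(5));
            std::lock_guard<std::mutex> lock(mutex_results); // released after each scan
            for (size_t i = 0; i < queue_results.size(); i++) {
                if (queue_results[i].id == task_id) {
                    task_result res = queue_results[i];
                    queue_results.erase(queue_results.begin() + i);
                    return res;
                }
            }
        }
    }

A condition variable would be the usual way to avoid the busy-wait; the polling version is simpler, and the short sleep keeps it from spinning at full speed.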
@@ -1373,48 +1380,48 @@ struct llama_server_context

     void process_tasks() {
         std::lock_guard<std::mutex> lock(mutex_tasks);
-        while(!queue_tasks.empty()) {
+        while (!queue_tasks.empty()) {
             task_server task = queue_tasks.front();
             queue_tasks.erase(queue_tasks.begin());
             switch (task.type)
             {
-                case COMPLETION_TASK: { // perform completion task
-                    llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1));
-                    if (slot == nullptr) {
-                        LOG_TEE("slot unavailable\n");
-                        // send error result
-                        send_error(task.id, "slot unavailable");
-                        return;
-                    }
+                case COMPLETION_TASK: {
+                    llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1));
+                    if (slot == nullptr) {
+                        LOG_TEE("slot unavailable\n");
+                        // send error result
+                        send_error(task.id, "slot unavailable");
+                        return;
+                    }

-                    if (task.data.contains("system_prompt")) {
-                        process_system_prompt_data(task.data["system_prompt"]);
-                    }
+                    if (task.data.contains("system_prompt")) {
+                        process_system_prompt_data(task.data["system_prompt"]);
+                    }

-                    slot->reset();
+                    slot->reset();

-                    slot->infill = task.infill_mode;
-                    slot->task_id = task.id;
+                    slot->infill = task.infill_mode;
+                    slot->task_id = task.id;

-                    if (!launch_slot_with_data(slot, task.data))
-                    {
-                        // send error result
-                        send_error(task.id, "internal_error");
-                        break;
-                    }
-                }
-                case CANCEL_TASK: { // release slot linked with the task id
-                    for(auto & slot : slots) {
-                        if(slot.task_id == task.target_id) {
-                            slot.release();
+                    if (!launch_slot_with_data(slot, task.data))
+                    {
+                        // send error result
+                        send_error(task.id, "internal_error");
                         break;
                     }
                 }
-                }
-                break;
+                case CANCEL_TASK: { // release slot linked with the task id
+                    for (auto & slot : slots) {
+                        if (slot.task_id == task.target_id) {
+                            slot.release();
+                            break;
+                        }
+                    }
+                }
+                break;

-                default:
-                    break;
+                default:
+                    break;
             }
         }
     }
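
process_tasks drains queue_tasks under mutex_tasks, dispatching COMPLETION_TASK and CANCEL_TASK entries. A self-contained sketch of the queue's producer/consumer shape (task_server here is a simplified stand-in; the real struct also carries JSON data and an infill flag):

    #include <mutex>
    #include <vector>

    enum task_type { COMPLETION_TASK, CANCEL_TASK };

    struct task_server { int id; int target_id; task_type type; };

    static std::mutex               mutex_tasks;
    static std::vector<task_server> queue_tasks;

    // producer: enqueue a task under the lock and hand back its id
    static int post_task(task_server task) {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        queue_tasks.push_back(task);
        return task.id;
    }

    // consumer: drain the queue under the same lock, as process_tasks does
    static void process_tasks() {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        while (!queue_tasks.empty()) {
            task_server task = queue_tasks.front();
            queue_tasks.erase(queue_tasks.begin());
            switch (task.type) {
                case COMPLETION_TASK: /* launch a slot with task.data */ break;
                case CANCEL_TASK:     /* release the slot with task.target_id */ break;
            }
        }
    }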
@@ -1426,6 +1433,7 @@ struct llama_server_context
         // update the system prompt and wait until all slots are in idle state
         if (need_update_system_prompt)
         {
+            LOG_TEE("updating system prompt\n");
             update_system_prompt();
         }

@@ -1435,6 +1443,7 @@ struct llama_server_context
     {
         if (system_prompt.empty() && clean_kv_cache)
         {
+            LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
             kv_cache_clear();
         }
         // avoid 100% cpu usage all the time
@@ -1449,6 +1458,7 @@ struct llama_server_context
         const int n_left = slot.n_past - slot.params.n_keep - 1;
         const int n_discard = n_left / 2;

+        LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
         llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1);
         llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);

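Worked example for the new log line: with slot.params.n_keep = 32 and slot.n_past = 512 (hypothetical values), n_left = 512 - 32 - 1 = 479 and n_discard = 239, so the seq_rm call drops KV cells [33, 272) and seq_shift slides [272, 512) down by 239 positions, freeing about half of the non-kept context. A standalone sketch of the arithmetic:

    #include <cstdio>

    int main() {
        // hypothetical values for one slot
        const int n_keep = 32;
        const int n_past = 512;

        const int n_left    = n_past - n_keep - 1; // 479
        const int n_discard = n_left / 2;          // 239

        // same ranges as the llama_kv_cache_seq_rm / seq_shift calls above
        printf("remove cells [%d, %d)\n", n_keep + 1, n_keep + n_discard + 1);               // [33, 272)
        printf("shift cells [%d, %d) by %d\n", n_keep + 1 + n_discard, n_past, -n_discard);  // [272, 512) -> [33, 273)
        return 0;
    }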
@@ -1463,7 +1473,7 @@ struct llama_server_context

         slot.truncated = true;

-        LOG_VERBOSE("input truncated", {
+        LOG_VERBOSE("context shift", {
             {"n_ctx", n_ctx},
             {"n_keep", params.n_keep},
             {"n_left", n_left},
@@ -1478,7 +1488,7 @@ struct llama_server_context
         if (slot.state == PROCESSING && slot.command == RELEASE)
         {
             slot.state = slot.params.cache_prompt ? SLEEPING : IDLE;
-            if(slot.state == SLEEPING) {
+            if (slot.state == SLEEPING) {
                 LOG_TEE("slot %i has %i tokens in cache.\n", slot.id, (int) slot.cache_tokens.size());
             }
             else
@@ -1504,6 +1514,7 @@ struct llama_server_context
             slot.n_decoded += 1;
             slot.n_past += 1;
         }
+
         // process in chunks of params.n_batch
         int32_t n_batch = params.n_batch;

@@ -1547,7 +1558,7 @@ struct llama_server_context

         slot.num_prompt_tokens = prompt_tokens.size();

-        if(!slot.params.cache_prompt)
+        if (!slot.params.cache_prompt)
         {
             std::fill(slot.ctx_sampling->prev.begin(), slot.ctx_sampling->prev.end(), 0);
             slot.n_past = 0;
@@ -1586,17 +1597,18 @@ struct llama_server_context
             std::copy(prompt_tokens.begin(), prompt_tokens.end(), slot.ctx_sampling->prev.end() - ps);
             slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
             slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
-            LOG_TEE("slot %i - in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+            LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
         }

+        LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, num_tokens_system + slot.n_past);
         llama_kv_cache_seq_rm(ctx, slot.id, num_tokens_system + slot.n_past, -1);

         slot.cache_tokens = prompt_tokens;

         if (slot.n_past == (int) slot.num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
-            printf("we have to evaluate at least 1 token to generate logits\n");
+            LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
             slot.n_past--;
         }

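The cache path above relies on common_part to find how many leading tokens of the new prompt are already in the slot's cache; only the remainder is reprocessed, and the updated log line reports that split. When the whole prompt is cached, slot.n_past is decremented so llama_decode still evaluates one token and produces fresh logits. A minimal sketch of what common_part is assumed to compute (the real helper lives elsewhere in server.cpp; int stands in for llama_token):

    #include <cstddef>
    #include <vector>

    // length of the shared prefix of two token sequences
    static size_t common_part(const std::vector<int> & a, const std::vector<int> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }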
@@ -1606,7 +1618,7 @@ struct llama_server_context
             {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
         });

-        const bool has_images = process_images(slot); // has images?
+        const bool has_images = process_images(slot);

         // process the prefix of the first image
         std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
@@ -1664,7 +1676,7 @@ struct llama_server_context
             return false;
         }

-        LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+        LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

         // retry with half the batch size to try to find a free slot in the KV cache
         n_batch /= 2;
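
The retry path above halves n_batch whenever llama_decode reports that it could not find free space in the KV cache, then re-attempts the decode. A simplified, self-contained sketch of that back-off loop (decode_fn stands in for the real llama_decode call; the server additionally treats negative return values as fatal, which this sketch folds into "nonzero means retry"):

    #include <cstdint>
    #include <cstdio>
    #include <functional>

    // returns true once a decode succeeds, false if n_batch shrinks to zero
    static bool decode_with_backoff(int32_t n_batch, const std::function<int(int32_t)> & decode_fn) {
        while (n_batch > 0) {
            if (decode_fn(n_batch) == 0) {
                return true; // decoded successfully
            }
            fprintf(stderr, "failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", n_batch / 2);
            n_batch /= 2; // retry with half the batch size
        }
        return false;
    }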
@@ -1705,7 +1717,7 @@ struct llama_server_context
         const int32_t n_probs = slot.sparams.n_probs;
         if (slot.sparams.temp <= 0 && n_probs > 0)
         {
-            // For llama_sample_token_greedy we need to sort candidates
+            // for llama_sample_token_greedy we need to sort candidates
             llama_sample_softmax(ctx, &cur_p);
         }
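
Context for the comment change: with temp <= 0 the server picks tokens greedily, but when the client requests n_probs alternatives the candidate list must be sorted by probability first. llama_sample_softmax both normalizes and sorts the candidates in descending order, so the top n_probs entries can then be read off the front. A self-contained sketch of that assumed behavior (token_data stands in for llama.h's llama_token_data):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct token_data { int id; float logit; float p; };

    int main() {
        std::vector<token_data> cur_p = {
            {0, 1.0f, 0.0f}, {1, 3.0f, 0.0f}, {2, 2.0f, 0.0f} // hypothetical logits
        };

        // what llama_sample_softmax is assumed to do: sort by logit, then normalize
        std::sort(cur_p.begin(), cur_p.end(), [](const token_data & a, const token_data & b) {
            return a.logit > b.logit;
        });
        float sum = 0.0f;
        for (auto & t : cur_p) { t.p = std::exp(t.logit - cur_p[0].logit); sum += t.p; }
        for (auto & t : cur_p) { t.p /= sum; }

        // greedy pick is cur_p[0]; the first n_probs entries are what gets reported
        const int n_probs = 2;
        for (int i = 0; i < n_probs; i++) {
            printf("token %d -> p = %.4f\n", cur_p[i].id, cur_p[i].p);
        }
        return 0;
    }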
