@@ -614,20 +614,24 @@ struct llama_server_context

         // create slots
         all_slots_are_idle = true;
-        if (max_ctx_per_slot == -1) {
+        if (max_ctx_per_slot == -1)
+        {
             max_ctx_per_slot = n_ctx / params.n_parallel; // split context
         }
-        if (max_ctx_per_slot * params.n_parallel > n_ctx) {
+        if (max_ctx_per_slot * params.n_parallel > n_ctx)
+        {
             printf("Error: The max context per slot is more greater than model context size");
             return;
         }
+
         LOG_TEE("Available slots:\n");
         for (int i = 0; i < params.n_parallel; i++)
         {
             llama_client_slot slot;
             slot.id = i;
             slot.max_context_size = max_ctx_per_slot;
             slot.reset();
+
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, max_ctx_per_slot);
             slots.push_back(slot);
         }
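For illustration only (not part of the patch): the block above splits the model context evenly across slots when max_ctx_per_slot is left at -1, and rejects values that would oversubscribe the model context. A minimal standalone sketch of that arithmetic, with hypothetical values for n_ctx and n_parallel:

#include <cstdio>

int main() {
    // hypothetical values: a 4096-token model context shared by 4 parallel slots
    const int n_ctx      = 4096;
    const int n_parallel = 4;

    int max_ctx_per_slot = -1; // -1 means "split the model context evenly"
    if (max_ctx_per_slot == -1) {
        max_ctx_per_slot = n_ctx / n_parallel; // 4096 / 4 = 1024 tokens per slot
    }

    // a user-supplied value must not oversubscribe the model context
    if (max_ctx_per_slot * n_parallel > n_ctx) {
        std::printf("error: per-slot context exceeds the model context size\n");
        return 1;
    }

    std::printf("each of the %d slots gets %d tokens of context\n", n_parallel, max_ctx_per_slot);
    return 0;
}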
@@ -788,7 +792,7 @@ struct llama_server_context
             }
         }

-        if (multimodal)
+        if (multimodal)
         {
             const auto &images_data = data.find("image_data");
             if (images_data != data.end() && images_data->is_array())
@@ -1068,10 +1072,10 @@ struct llama_server_context
            slot.has_next_token = false;
        }

-       if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)){
-           slot.stopped_eos = true;
-           slot.has_next_token = false;
-           LOG_VERBOSE("eos token found", {});
+       if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)) {
+           slot.stopped_eos = true;
+           slot.has_next_token = false;
+           LOG_VERBOSE("eos token found", {});
        }

        LOG_VERBOSE("next token", {
@@ -1277,22 +1281,25 @@ struct llama_server_context
    }

    task_result next_result(int task_id) {
-       while (true) {
+       while (true) {
            std::this_thread::sleep_for(std::chrono::microseconds(5));
            std::lock_guard<std::mutex> lock(mutex_results);
-           if (queue_results.empty()) {
+
+           if (queue_results.empty()) {
                continue;
            }

-           for (int i = 0; i < queue_results.size(); i++) {
-               if (queue_results[i].id == task_id) {
+           for (int i = 0; i < (int) queue_results.size(); i++) {
+               if (queue_results[i].id == task_id) {
                    task_result res = queue_results[i];
                    queue_results.erase(queue_results.begin() + i);
                    return res;
                }
            }
        }
-       return task_result{-1, false, false, {}};
+
+       // never reached
+       // return task_result{-1, false, false, {}};
    }

    // for multiple images processing
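For context, a rough standalone sketch (simplified stand-in types, not the server's actual task_result or queue_results) of the polling pattern next_result relies on: sleep briefly, take the lock, scan the shared queue for a matching id, and retry until it appears. The lock is released at the end of every loop iteration, so producers can push results in between:

#include <chrono>
#include <cstddef>
#include <mutex>
#include <thread>
#include <vector>

struct result { int id; int value; };    // stand-in for task_result

std::mutex          results_mutex;        // stand-in for mutex_results
std::vector<result> results;              // stand-in for queue_results

// block until a result with the given id shows up, removing it from the queue
result wait_for(int id) {
    while (true) {
        std::this_thread::sleep_for(std::chrono::microseconds(5));
        std::lock_guard<std::mutex> lock(results_mutex);
        for (std::size_t i = 0; i < results.size(); i++) {
            if (results[i].id == id) {
                result r = results[i];
                results.erase(results.begin() + i);
                return r;
            }
        }
        // nothing yet: drop the lock (end of scope) and poll again
    }
}

int main() {
    std::thread producer([] {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        std::lock_guard<std::mutex> lock(results_mutex);
        results.push_back({42, 7});
    });
    result r = wait_for(42);
    producer.join();
    return r.value == 7 ? 0 : 1;
}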
@@ -1373,48 +1380,48 @@ struct llama_server_context

    void process_tasks() {
        std::lock_guard<std::mutex> lock(mutex_tasks);
-       while (!queue_tasks.empty()) {
+       while (!queue_tasks.empty()) {
            task_server task = queue_tasks.front();
            queue_tasks.erase(queue_tasks.begin());
            switch (task.type)
            {
-               case COMPLETION_TASK: { // perform completion task
-                   llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1));
-                   if (slot == nullptr) {
-                       LOG_TEE("slot unavailable\n");
-                       // send error result
-                       send_error(task.id, "slot unavaliable");
-                       return;
-                   }
+               case COMPLETION_TASK: {
+                   llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1));
+                   if (slot == nullptr) {
+                       LOG_TEE("slot unavailable\n");
+                       // send error result
+                       send_error(task.id, "slot unavaliable");
+                       return;
+                   }

-                   if (task.data.contains("system_prompt")) {
-                       process_system_prompt_data(task.data["system_prompt"]);
-                   }
+                   if (task.data.contains("system_prompt")) {
+                       process_system_prompt_data(task.data["system_prompt"]);
+                   }

-                   slot->reset();
+                   slot->reset();

-                   slot->infill = task.infill_mode;
-                   slot->task_id = task.id;
+                   slot->infill = task.infill_mode;
+                   slot->task_id = task.id;

-                   if (!launch_slot_with_data(slot, task.data))
-                   {
-                       // send error result
-                       send_error(task.id, "internal_error");
-                       break;
-                   }
-               }
-               case CANCEL_TASK: { // release slot linked with the task id
-                   for (auto & slot : slots) {
-                       if (slot.task_id == task.target_id) {
-                           slot.release();
+                   if (!launch_slot_with_data(slot, task.data))
+                   {
+                       // send error result
+                       send_error(task.id, "internal_error");
                            break;
                        }
                    }
-               }
-               break;
+               case CANCEL_TASK: { // release slot linked with the task id
+                   for (auto & slot : slots) {
+                       if (slot.task_id == task.target_id) {
+                           slot.release();
+                           break;
+                       }
+                   }
+               }
+               break;

-               default:
-                   break;
+               default:
+                   break;
            }
        }
    }
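For orientation, a trimmed-down sketch (stand-in types, not the server's actual task_server definition) of the dispatch shape process_tasks follows: drain a mutex-guarded task list and branch on the task type:

#include <cstdio>
#include <mutex>
#include <vector>

enum task_type { COMPLETION_TASK, CANCEL_TASK };
struct task_server { int id; task_type type; };   // stand-in for the real task struct

std::mutex               tasks_mutex;              // stand-in for mutex_tasks
std::vector<task_server> queue_tasks;

void process_tasks() {
    std::lock_guard<std::mutex> lock(tasks_mutex);
    while (!queue_tasks.empty()) {
        task_server task = queue_tasks.front();
        queue_tasks.erase(queue_tasks.begin());
        switch (task.type) {
            case COMPLETION_TASK:
                std::printf("task %d: start a completion on a free slot\n", task.id);
                break;
            case CANCEL_TASK:
                std::printf("task %d: release the slot bound to this task\n", task.id);
                break;
        }
    }
}

int main() {
    queue_tasks.push_back({1, COMPLETION_TASK});
    queue_tasks.push_back({2, CANCEL_TASK});
    process_tasks();
    return 0;
}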
@@ -1426,6 +1433,7 @@ struct llama_server_context
        // update the system prompt wait until all slots are idle state
        if (need_update_system_prompt)
        {
+           LOG_TEE("updating system prompt\n");
            update_system_prompt();
        }

@@ -1435,6 +1443,7 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
+               LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
                kv_cache_clear();
            }
            // avoid 100% usage of cpu all time
@@ -1449,6 +1458,7 @@ struct llama_server_context
                const int n_left    = slot.n_past - slot.params.n_keep - 1;
                const int n_discard = n_left / 2;

+               LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
                llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1);
                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);

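As a worked example of the shift arithmetic this new log line reports (illustrative numbers, not taken from the patch): with n_past = 1024 and n_keep = 128, n_left = 1024 - 128 - 1 = 895 and n_discard = 447, so roughly the older half of the evictable tokens is removed and the rest is shifted down. A minimal sketch of the same computation:

#include <cstdio>

int main() {
    // illustrative values: a slot that has 1024 tokens in context and keeps the first 128
    const int n_past    = 1024;
    const int n_keep    = 128;

    const int n_left    = n_past - n_keep - 1;   // 895 tokens eligible for eviction
    const int n_discard = n_left / 2;            // 447 tokens actually dropped

    std::printf("remove cache range [%d, %d), shift the remaining %d tokens down by %d\n",
                n_keep + 1, n_keep + n_discard + 1, n_left - n_discard, n_discard);
    return 0;
}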
@@ -1463,7 +1473,7 @@ struct llama_server_context

                slot.truncated = true;

-               LOG_VERBOSE("input truncated", {
+               LOG_VERBOSE("context shift", {
                    {"n_ctx",  n_ctx},
                    {"n_keep", params.n_keep},
                    {"n_left", n_left},
@@ -1478,7 +1488,7 @@ struct llama_server_context
            if (slot.state == PROCESSING && slot.command == RELEASE)
            {
                slot.state = slot.params.cache_prompt ? SLEEPING : IDLE;
-               if (slot.state == SLEEPING) {
+               if (slot.state == SLEEPING) {
                    LOG_TEE("slot %i has %i tokens in cache.\n", slot.id, (int) slot.cache_tokens.size());
                }
                else
@@ -1504,6 +1514,7 @@ struct llama_server_context
                slot.n_decoded += 1;
                slot.n_past += 1;
            }
+
            // process in chunks of params.n_batch
            int32_t n_batch = params.n_batch;

@@ -1547,7 +1558,7 @@ struct llama_server_context

                slot.num_prompt_tokens = prompt_tokens.size();

-               if (!slot.params.cache_prompt)
+               if (!slot.params.cache_prompt)
                {
                    std::fill(slot.ctx_sampling->prev.begin(), slot.ctx_sampling->prev.end(), 0);
                    slot.n_past = 0;
@@ -1586,17 +1597,18 @@ struct llama_server_context
                    std::copy(prompt_tokens.begin(), prompt_tokens.end(), slot.ctx_sampling->prev.end() - ps);
                    slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
                    slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
-                   LOG_TEE("slot %i - in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+                   LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                }

+               LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, num_tokens_system + slot.n_past);
                llama_kv_cache_seq_rm(ctx, slot.id, num_tokens_system + slot.n_past, -1);

                slot.cache_tokens = prompt_tokens;

                if (slot.n_past == (int) slot.num_prompt_tokens)
                {
                    // we have to evaluate at least 1 token to generate logits.
-                   printf("we have to evaluate at least 1 token to generate logits\n");
+                   LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
                    slot.n_past--;
                }

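For illustration (common_part is defined elsewhere in server.cpp; this is only a sketch of the idea), the prompt-cache reuse above amounts to counting how many leading tokens the cached sequence and the new prompt share, so that only the differing tail has to be re-evaluated:

#include <cstddef>
#include <cstdio>
#include <vector>

// length of the shared prefix of two token sequences (sketch of what common_part computes)
static std::size_t common_prefix(const std::vector<int> & a, const std::vector<int> & b) {
    std::size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}

int main() {
    std::vector<int> cache_tokens  = {1, 15, 27, 300, 42, 8};    // what the slot already evaluated
    std::vector<int> prompt_tokens = {1, 15, 27, 300, 99, 5, 6}; // the new request

    std::size_t n_past = common_prefix(cache_tokens, prompt_tokens);
    std::printf("reuse %zu cached tokens, re-evaluate the remaining %zu\n",
                n_past, prompt_tokens.size() - n_past);
    return 0;
}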
@@ -1606,7 +1618,7 @@ struct llama_server_context
                    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
                });

-               const bool has_images = process_images(slot); // has images?
+               const bool has_images = process_images(slot);

                // process the prefix of first image
                std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
@@ -1664,7 +1676,7 @@ struct llama_server_context
                    return false;
                }

-               LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+               LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
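A rough sketch of the retry strategy around this log line (decode_step and the numbers are stand-ins, not llama.cpp APIs): when a decode attempt cannot find free KV cache space, halve the batch size and try again until the step succeeds or the batch cannot shrink further:

#include <cstdio>

// stand-in for a decode call that fails when the batch is too large for the free KV cache space
static bool decode_step(int n_tokens, int kv_free) {
    return n_tokens <= kv_free;
}

int main() {
    int n_batch  = 512;   // configured batch size
    int n_tokens = 512;   // tokens queued for this decode step
    int kv_free  = 100;   // hypothetical number of free KV cache cells

    while (!decode_step(n_tokens < n_batch ? n_tokens : n_batch, kv_free)) {
        if (n_batch == 1) {
            std::printf("decode failed even with n_batch = 1\n");
            return 1;
        }
        std::printf("failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", n_batch / 2);
        n_batch /= 2;   // retry with half the batch size
    }
    std::printf("decoded with n_batch = %d\n", n_batch);
    return 0;
}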
@@ -1705,7 +1717,7 @@ struct llama_server_context
            const int32_t n_probs = slot.sparams.n_probs;
            if (slot.sparams.temp <= 0 && n_probs > 0)
            {
-               // For llama_sample_token_greedy we need to sort candidates
+               // for llama_sample_token_greedy we need to sort candidates
                llama_sample_softmax(ctx, &cur_p);
            }
