@@ -1559,7 +1559,8 @@ struct llama_server_context
 
                 if (!slot.params.cache_prompt)
                 {
-                    std::fill(slot.ctx_sampling->prev.begin(), slot.ctx_sampling->prev.end(), 0);
+                    llama_sampling_reset(slot.ctx_sampling);
+
                     slot.n_past = 0;
                     slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
                 }
@@ -1570,16 +1571,17 @@ struct llama_server_context
                         slot.params.n_keep = slot.num_prompt_tokens;
                     }
                     slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-                    // if input prompt is too big, truncate like normal
+
+                    // if input prompt is too big, truncate it
                     if (slot.num_prompt_tokens >= slot.n_ctx)
                     {
-                        // applied bug of #3661
                         const int n_left = slot.n_ctx - slot.params.n_keep;
                         const int n_block_size = n_left / 2;
                         const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+
                         std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
-                        // Use half the left-over space in the context for the prompt
                         new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+
                         LOG_VERBOSE("input truncated", {
                             {"n_ctx",  slot.n_ctx},
                             {"n_keep", slot.params.n_keep},
@@ -1588,14 +1590,20 @@ struct llama_server_context
                         });
                         slot.truncated = true;
                         prompt_tokens = new_tokens;
+
                         slot.num_prompt_tokens = prompt_tokens.size();
                         GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
                     }
-                    const size_t ps = slot.num_prompt_tokens;
-                    std::fill(slot.ctx_sampling->prev.begin(), slot.ctx_sampling->prev.end() - ps, 0);
-                    std::copy(prompt_tokens.begin(), prompt_tokens.end(), slot.ctx_sampling->prev.end() - ps);
+
+                    // push the prompt into the sampling context (do not apply grammar)
+                    for (auto &token : prompt_tokens)
+                    {
+                        llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                    }
+
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
                     slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
+
                     LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                 }
 
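For reference, a minimal standalone sketch of the prompt-priming pattern the diff introduces: reset the per-slot sampling context, then accept each prompt token without applying the grammar, so penalty sampling sees the actual recent tokens instead of a hand-filled prev buffer. It only uses the llama_sampling_reset / llama_sampling_accept calls shown in the diff; the wrapper name prime_sampling_context and the include lines are illustrative and not part of server.cpp.

#include <vector>

#include "llama.h"
#include "sampling.h"   // assumed location of llama_sampling_context and friends

// Illustrative helper (not in server.cpp): replay a prompt into a slot's
// sampling context before generation starts.
static void prime_sampling_context(llama_sampling_context        * ctx_sampling,
                                   llama_context                 * ctx,
                                   const std::vector<llama_token> & prompt_tokens)
{
    // drop any history left over from a previous request
    llama_sampling_reset(ctx_sampling);

    // accept prompt tokens without advancing the grammar (apply_grammar = false),
    // mirroring the loop added in the diff above
    for (const llama_token token : prompt_tokens) {
        llama_sampling_accept(ctx_sampling, ctx, token, false);
    }
}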