
Commit fd72d2d

phymbert and ggerganov authored

server: tests: add truncated prompt tests, better kv cache size (#5933)

* server: tests: add truncated prompt tests, better size
* server, tests: update regex

Co-authored-by: Georgi Gerganov <[email protected]>

1 parent c2101a2 · commit fd72d2d
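
The "better kv cache size" part of the change resizes the test servers' KV cache to match their slot count. A quick back-of-the-envelope sketch in Python, for illustration only; it assumes the server splits the total KV cache evenly across slots, as the comments in server.feature describe:

    # Per-slot context budget, assuming an even split of the KV cache across slots.
    def ctx_per_slot(kv_cache_size: int, n_slots: int) -> int:
        return kv_cache_size // n_slots

    # parallel.feature: 64 tokens across 2 slots left only 32 tokens per slot;
    # 256 tokens across 2 slots gives each slot 128 tokens for prompt + prediction.
    print(ctx_per_slot(64, 2), ctx_per_slot(256, 2))  # 32 128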

File tree: 4 files changed, +81 / -23 lines

examples/server/server.cpp

Lines changed: 19 additions & 4 deletions
@@ -1128,6 +1128,7 @@ struct server_context {
 
             LOG_VERBOSE("stopped by limit", {
                 {"id_slot",   slot.id},
+                {"id_task",   slot.id_task},
                 {"n_decoded", slot.n_decoded},
                 {"n_predict", slot.params.n_predict},
             });
@@ -1141,6 +1142,8 @@ struct server_context {
         }
 
         LOG_VERBOSE("next token", {
+            {"id_slot",        slot.id},
+            {"id_task",        slot.id_task},
             {"token",          result.tok},
             {"token_text",     tokens_to_output_formatted_string(ctx, result.tok)},
             {"has_next_token", slot.has_next_token},
@@ -1750,6 +1753,15 @@ struct server_context {
                 slot.n_past = 0;
                 slot.n_prompt_tokens = prompt_tokens.size();
 
+                LOG_VERBOSE("prompt tokenized", {
+                    {"id_slot",         slot.id},
+                    {"id_task",         slot.id_task},
+                    {"n_ctx",           slot.n_ctx},
+                    {"n_keep",          slot.params.n_keep},
+                    {"n_prompt_tokens", slot.n_prompt_tokens},
+                    {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+                });
+
                 if (slot.embedding) {
                     // this prompt is too large to process - discard it
                     if (slot.n_prompt_tokens > n_batch) {
@@ -1788,10 +1800,13 @@ struct server_context {
                     slot.n_prompt_tokens = prompt_tokens.size();
 
                     LOG_VERBOSE("input truncated", {
-                        {"n_ctx",           slot.n_ctx},
-                        {"n_keep",          slot.params.n_keep},
-                        {"n_left",          n_left},
-                        {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+                        {"id_slot",         slot.id},
+                        {"id_task",         slot.id_task},
+                        {"n_ctx",           slot.n_ctx},
+                        {"n_keep",          slot.params.n_keep},
+                        {"n_left",          n_left},
+                        {"n_prompt_tokens", slot.n_prompt_tokens},
+                        {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                     });
 
                     GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
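
For context, the "input truncated" entry is logged after the server has shrunk a prompt that no longer fits the slot's context window. A minimal Python sketch of that scheme, assuming the keep-the-first-n_keep-tokens-then-drop-middle-blocks behaviour of the surrounding server.cpp code (which is not part of this diff):

    # Illustrative only -- approximates the truncation that precedes the
    # "input truncated" log line: keep the first n_keep tokens, then drop whole
    # half-context blocks from the middle until the prompt fits in n_ctx.
    def truncate_prompt(prompt_tokens: list[int], n_ctx: int, n_keep: int) -> list[int]:
        if len(prompt_tokens) < n_ctx:
            return prompt_tokens                      # already fits, nothing to do
        n_left  = n_ctx - n_keep                      # budget left after the kept prefix
        n_block = n_left // 2                         # size of the blocks erased from the middle
        n_erase = ((len(prompt_tokens) - n_keep - n_block) // n_block) * n_block
        return prompt_tokens[:n_keep] + prompt_tokens[n_keep + n_erase:]

    # e.g. a 300-token prompt with a 128-token slot context and n_keep = 0
    print(len(truncate_prompt(list(range(300)), n_ctx=128, n_keep=0)))  # 108, satisfies the assert above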

examples/server/tests/features/parallel.feature

Lines changed: 3 additions & 2 deletions
@@ -6,8 +6,8 @@ Feature: Parallel
     Given a server listening on localhost:8080
     And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And 42 as server seed
-    And 512 as batch size
-    And 64 KV cache size
+    And 128 as batch size
+    And 256 KV cache size
     And 2 slots
     And continuous batching
     Then the server is starting
@@ -76,6 +76,7 @@ Feature: Parallel
       | disabled | 128 |
       | enabled  | 64  |
 
+
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """

examples/server/tests/features/server.feature

Lines changed: 30 additions & 11 deletions
@@ -10,11 +10,10 @@ Feature: llama.cpp server
     # KV Cache corresponds to the total amount of tokens
     # that can be stored across all independent sequences: #4130
     # see --ctx-size and #5568
-    And 32 KV cache size
-    And 512 as batch size
-    And 1 slots
-    And embeddings extraction
-    And 32 server max tokens to predict
+    And 256 KV cache size
+    And 32 as batch size
+    And 2 slots
+    And 64 server max tokens to predict
     And prometheus compatible metrics exposed
     Then the server is starting
     Then the server is healthy
@@ -23,18 +22,35 @@ Feature: llama.cpp server
     Then the server is ready
     And all slots are idle
 
+
   Scenario Outline: Completion
     Given a prompt <prompt>
     And <n_predict> max tokens to predict
     And a completion request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
+    And the completion is <truncated> truncated
+    And <n_prompt> prompt tokens are processed
     And prometheus metrics are exposed
     And metric llamacpp:tokens_predicted is <n_predicted>
 
     Examples: Prompts
-      | prompt                           | n_predict | re_content                       | n_predicted |
-      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
-      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |
+      | prompt                                                                    | n_predict | re_content                    | n_prompt | n_predicted | truncated |
+      | I believe the meaning of life is                                          | 8         | (read\|going)+                | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids)+ | 46       | 64          | not       |
+
+  Scenario: Completion prompt truncated
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And a completion request with no api error
+    Then 64 tokens are predicted matching fun|Annaks|popcorns
+    And the completion is truncated
+    And 109 prompt tokens are processed
+
 
   Scenario Outline: OAI Compatibility
     Given a model <model>
@@ -44,11 +60,14 @@ Feature: llama.cpp server
     And streaming is <enable_streaming>
     Given an OAI compatible chat completions request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
+    And <n_prompt> prompt tokens are processed
+    And the completion is <truncated> truncated
 
     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_prompt | n_predicted | enable_streaming | truncated |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+          | 77       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird)+ | -1       | 64          | enabled          |           |
+
 
   Scenario: Tokenize / Detokenize
     When tokenizing:
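
The new steps read two things from the completion these scenarios exercise: the truncated flag and the timings block. A hand-rolled check of the same fields, for illustration only (it assumes a llama.cpp server already running on localhost:8080, as in the feature background, and a /completion response exposing the fields the step definitions read):

    # Illustrative only: inspect the same response fields the new steps assert on.
    import requests

    resp = requests.post(
        "http://localhost:8080/completion",
        json={
            "prompt": "Lorem ipsum dolor sit amet, " * 50,  # deliberately longer than the slot context
            "n_predict": 64,
        },
    )
    data = resp.json()
    print("truncated:  ", data["truncated"])                # True once the prompt had to be cut down
    print("prompt_n:   ", data["timings"]["prompt_n"])      # prompt tokens actually processed
    print("predicted_n:", data["timings"]["predicted_n"])   # completion tokens generated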

examples/server/tests/features/steps/steps.py

Lines changed: 29 additions & 6 deletions
@@ -196,12 +196,30 @@ async def step_request_completion(context, api_error):
 
 @step(u'{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n, re_content)
 
 
 @step(u'{predicted_n:d} tokens are predicted')
 def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n)
+
+
+@step(u'the completion is truncated')
+def step_assert_completion_truncated(context):
+    step_assert_completion_truncated(context, '')
+
+
+@step(u'the completion is {truncated} truncated')
+def step_assert_completion_truncated(context, truncated):
+    truncated = truncated != "not"
+    assert context.completion['truncated'] == truncated, f'{context.completion}'
+
+
+@step(u'{n_prompt:d} prompt tokens are processed')
+def step_impl(context, n_prompt):
+    assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
 
 
 @step(u'a user prompt {user_prompt}')
@@ -722,7 +740,8 @@ async def oai_chat_completions(user_prompt,
     completion_response = {
         'content': '',
         'timings': {
-            'predicted_n': 0
+            'predicted_n': 0,
+            'prompt_n': 0
         }
     }
     if async_client:
@@ -763,7 +782,8 @@ async def oai_chat_completions(user_prompt,
            completion_response = {
                'content': chat_completion_raw['choices'][0]['message'],
                'timings': {
-                    'predicted_n': chat_completion_raw['usage']['completion_tokens']
+                    'predicted_n': chat_completion_raw['usage']['completion_tokens'],
+                    'prompt_n': chat_completion_raw['usage']['prompt_tokens']
                }
            }
    else:
@@ -792,13 +812,16 @@ async def oai_chat_completions(user_prompt,
                if 'content' in delta:
                    completion_response['content'] += delta['content']
                    completion_response['timings']['predicted_n'] += 1
+            completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
        else:
            assert len(chat_completion.choices) == 1
            completion_response = {
                'content': chat_completion.choices[0].message.content,
                'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens
-                }
+                    'predicted_n': chat_completion.usage.completion_tokens,
+                    'prompt_n': chat_completion.usage.prompt_tokens
+                },
+                'truncated': chat_completion.choices[0].finish_reason != 'stop'
            }
    if debug:
        print("OAI response formatted to llama.cpp:", completion_response)
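
Taken together, the new steps stash the popped completion on the behave context and then assert on it. A standalone restatement of those two checks (the helper name and sample data below are illustrative, not taken from steps.py):

    # Illustrative helper mirroring the new assertions: truncation flag and
    # number of prompt tokens processed (n_prompt < 0 means "don't care",
    # as in the step definition above).
    def check_completion(completion: dict, expect_truncated: bool, n_prompt: int) -> None:
        assert completion['truncated'] == expect_truncated, completion
        assert n_prompt < 0 or completion['timings']['prompt_n'] == n_prompt, completion['timings']

    # Shape matches what oai_chat_completions() builds above.
    check_completion(
        {'content': 'Here is ...', 'timings': {'predicted_n': 8, 'prompt_n': 77}, 'truncated': False},
        expect_truncated=False,
        n_prompt=77,
    )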
