Commit 9731134
server: tests: passkey challenge / self-extend with context shift demo (#5832)

* server: tests: add models endpoint scenario
* server: /v1/models add some metadata
* server: tests: add debug field in context before scenario
* server: tests: download model from HF, add batch size
* server: tests: add passkey test
* server: tests: add group attention params
* server: do not truncate prompt tokens if self-extend through group attention is enabled
* server: logs: do not truncate log values
* server: tests - passkey - first good working value of nga
* server: tests: fix server timeout
* server: tests: fix passkey, add doc, fix regex content matching, fix timeout
* server: tests: fix regex content matching
* server: tests: schedule slow tests on master
* server: metrics: fix when no prompt processed
* server: tests: self-extend add llama-2-7B and Mixtral-8x7B-v0.1
* server: tests: increase timeout for completion
* server: tests: keep only the PHI-2 test
* server: tests: passkey add a negative test

1 parent 4a6e2d6 commit 9731134

File tree: 14 files changed, +362 -111 lines changed

.github/workflows/server.yml

Lines changed: 9 additions & 6 deletions

@@ -10,6 +10,8 @@ on:
   pull_request:
     types: [opened, synchronize, reopened]
     paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+  schedule:
+    - cron: '00 0 * * *'

 jobs:
   server:
@@ -70,14 +72,15 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt

-      - name: Download models
-        id: download_models
+      - name: Tests
+        id: server_integration_tests
         run: |
           cd examples/server/tests
-          ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
+          PORT=8888 ./tests.sh

-      - name: Tests
-        id: server_integration_test
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: github.event.schedule != ''
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh
+          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
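
For reference, the new scheduled job can be reproduced locally; a minimal sketch, assuming the server binary has already been built as described in examples/server/tests/README.md:

```shell
# install the Python dependencies used by the behave suite (same step as the workflow)
pip install -r examples/server/tests/requirements.txt

# run the scenarios tagged @slow on the same port the workflow uses,
# stopping at the first failure and keeping stdout visible
cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
```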

examples/server/server.cpp

Lines changed: 31 additions & 15 deletions

@@ -441,8 +441,8 @@ struct llama_server_context
     const int ga_w = params.grp_attn_w;

     if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+        GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+        GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
         //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
         //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT

@@ -1709,8 +1709,8 @@ struct llama_server_context
            }
            slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

-           // if input prompt is too big, truncate it
-           if (slot.n_prompt_tokens >= slot.n_ctx)
+           // if input prompt is too big, truncate it, if group attention self-extend is disabled
+           if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
            {
                const int n_left = slot.n_ctx - slot.params.n_keep;
                const int n_block_size = n_left / 2;
@@ -1785,9 +1785,11 @@ struct llama_server_context
                }

                LOG_INFO("slot progression", {
-                   { "slot_id", slot.id },
-                   { "task_id", slot.task_id },
-                   { "n_past", slot.n_past },
+                   { "slot_id", slot.id },
+                   { "task_id", slot.task_id },
+                   { "n_past", slot.n_past },
+                   { "n_past_se", slot.n_past_se },
+                   { "ga_i", slot.ga_i },
                    { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
                });
            }
@@ -2001,6 +2003,17 @@ struct llama_server_context
        LOG_VERBOSE("slots updated", {});
        return true;
    }
+
+   json model_meta() {
+       return json{
+           {"vocab_type", llama_vocab_type(model)},
+           {"n_vocab", llama_n_vocab(model)},
+           {"n_ctx_train", llama_n_ctx_train(model)},
+           {"n_embd", llama_n_embd(model)},
+           {"n_params", llama_model_n_params(model)},
+           {"size", llama_model_size(model)},
+       };
+   }
 };

 static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2911,9 +2924,10 @@ int main(int argc, char **argv)
            for (const auto& metric_def : metrics_def) {
                std::string name = metric_def["name"];
                std::string help = metric_def["help"];
-               prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
-                          << "# TYPE llamacpp:" << name << " " << type << "\n"
-                          << "llamacpp:" << name << " " << metric_def["value"] << "\n";
+               auto value = json_value(metric_def, "value", 0);
+               prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+                          << "# TYPE llamacpp:" << name << " " << type << "\n"
+                          << "llamacpp:" << name << " " << value << "\n";
            }
        }

@@ -2994,6 +3008,7 @@ int main(int argc, char **argv)
        state.store(SERVER_STATE_READY);
        LOG_INFO("model loaded", {});
    }
+   const auto model_meta = llama.model_meta();

    if (sparams.chat_template.empty()) { // custom chat template is not supplied
        // check if the template comes with the model is supported by us
@@ -3143,7 +3158,7 @@ int main(int argc, char **argv)
        }
    });

-   svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
+   svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
    {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        std::time_t t = std::time(0);
@@ -3152,10 +3167,11 @@ int main(int argc, char **argv)
            {"object", "list"},
            {"data", {
                {
-                   {"id", params.model_alias},
-                   {"object", "model"},
-                   {"created", t},
-                   {"owned_by", "llamacpp"}
+                   {"id", params.model_alias},
+                   {"object", "model"},
+                   {"created", t},
+                   {"owned_by", "llamacpp"},
+                   {"meta", model_meta}
                },
            }}
        };
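
The new `model_meta()` helper is surfaced through the OpenAI-compatible `/v1/models` endpoint. A hedged sketch of querying it (the port assumes the server default `8080`, and the response values are illustrative rather than taken from a real run):

```shell
# list models; the "meta" object is what this commit adds
curl -s http://localhost:8080/v1/models

# illustrative response shape:
# {
#   "object": "list",
#   "data": [{
#     "id": "<model alias>", "object": "model", "created": <unix time>, "owned_by": "llamacpp",
#     "meta": { "vocab_type": ..., "n_vocab": ..., "n_ctx_train": ..., "n_embd": ..., "n_params": ..., "size": ... }
#   }]
# }
```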

examples/server/tests/README.md

Lines changed: 35 additions & 15 deletions

@@ -1,47 +1,67 @@
 # Server tests

-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
-* [issues.feature](./features/issues.feature) Pending issues scenario
-* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
-* [security.feature](./features/security.feature) Security, CORS and API Key
-* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
+Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
+and [behave](https://behave.readthedocs.io/en/latest/):
+
+* [issues.feature](./features/issues.feature) Pending issues scenario
+* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
+* [security.feature](./features/security.feature) Security, CORS and API Key
+* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...

 Tests target GitHub workflows job runners with 4 vCPU.

-Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client.
+Requests are
+using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
+based http client.

-Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`.
+Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
+To mitigate it, you can increase values in `n_predict`, `kv_size`.

 ### Install dependencies
+
 `pip install -r requirements.txt`

 ### Run tests
+
 1. Build the server
+
 ```shell
 cd ../../..
 mkdir build
 cd build
 cmake ../
 cmake --build . --target server
 ```
-2. download required models:
-   1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
-3. Start the test: `./tests.sh`
+
+2. Start the test: `./tests.sh`

 It's possible to override some scenario steps values with environment variables:
-- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
-- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
-- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
-- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
+
+| variable                 | description                                                                                     |
+|--------------------------|-------------------------------------------------------------------------------------------------|
+| `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
+| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/server`                         |
+| `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                       |
+| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format                                                        |
+| `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                 |

 ### Run @bug, @wip or @wrong_usage annotated scenario

 Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
+
 - `@bug` annotation aims to link a scenario with a GitHub issue.
 - `@wrong_usage` are meant to show user issue that are actually an expected behavior
 - `@wip` to focus on a scenario working in progress
+- `@slow` heavy test, disabled by default

 To run a scenario annotated with `@bug`, start:
-`DEBUG=ON ./tests.sh --no-skipped --tags bug`
+
+```shell
+DEBUG=ON ./tests.sh --no-skipped --tags bug
+```

 After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
+
+```shell
+./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
+```
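
As an example of combining these overrides, the following invocations are a sketch; the tags used come from the feature files in this commit:

```shell
# run the default scope on another port with verbose step and server logs
PORT=8888 DEBUG=ON ./tests.sh

# run the slow passkey scenarios, offloading some layers to the GPU
N_GPU_LAYERS=5 ./tests.sh --no-skipped --tags passkey
```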

examples/server/tests/features/environment.py

Lines changed: 4 additions & 1 deletion

@@ -7,7 +7,10 @@


 def before_scenario(context, scenario):
-    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
+    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
+    if context.debug:
+        print("DEBUG=ON\n")
+    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
     port = 8080
     if 'PORT' in os.environ:
         port = int(os.environ['PORT'])

examples/server/tests/features/issues.feature

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 # List of ongoing issues
+# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
 @bug
 Feature: Issues
   # No confirmed issue at the moment

examples/server/tests/features/parallel.feature

Lines changed: 3 additions & 2 deletions

@@ -1,11 +1,12 @@
 @llama.cpp
+@parallel
 Feature: Parallel

   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a model alias tinyllama-2
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   42 as server seed
+    And   512 as batch size
     And   64 KV cache size
     And   2 slots
     And   embeddings extraction
examples/server/tests/features/passkey.feature

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+# run with: ./tests.sh --no-skipped --tags passkey
+@passkey
+@slow
+Feature: Passkey / Self-extend with context shift
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+
+  # Generates a long text of junk and inserts a secret passkey number inside it.
+  # Then we query the LLM for the secret passkey.
+  # see #3856 and #4810
+  Scenario Outline: Passkey
+    Given a model file <hf_file> from HF repo <hf_repo>
+    And   <n_batch> as batch size
+    And   <n_junk> as number of junk
+    And   <n_predicted> server max tokens to predict
+    And   42 as seed
+    And   <n_ctx> KV cache size
+    And   1 slots
+    And   <n_ga> group attention factor to extend context size through self-extend
+    And   <n_ga_w> group attention width to extend context size through self-extend
+    # Can be override with N_GPU_LAYERS
+    And   <ngl> GPU offloaded layers
+    Then  the server is starting
+    Then  the server is healthy
+    Given available models
+    Then  model 0 is trained on <n_ctx_train> tokens context
+    Given a prefix prompt:
+    """
+    here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
+    """
+    And a passkey prompt template:
+    """
+    The pass key is <passkey> Remember it. <passkey> is the pass key.
+    """
+    And a junk suffix prompt:
+    """
+    The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
+    """
+    And a suffix prompt:
+    """
+    What is the pass key? The pass key is
+    """
+    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
+    And   a completion request with no api error
+    Then  <n_predicted> tokens are predicted matching <re_content>
+
+    Examples:
+      | hf_repo             | hf_file           | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content      |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 8192  | 512     | 4    | 512    | 250    | 50    | 42      | 1           | 42              |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 8192  | 512     | 2    | 512    | 250    | 50    | 42      | 1           | \b((?!42)\w)+\b |
+      #| TheBloke/Llama-2-7B-GGUF        | llama-2-7b.Q2_K.gguf        | 4096  | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
+      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
+      # 987 |
+
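
Outside the behave harness, the first Examples row corresponds roughly to starting the server by hand with self-extend enabled. A hedged sketch, assuming the usual llama.cpp server flags `--grp-attn-n` / `--grp-attn-w` for the group-attention parameters and an illustrative model path:

```shell
# download the quantized model used by the scenario (destination path may differ)
../../../scripts/hf.sh --repo TheBloke/phi-2-GGUF --file phi-2.Q4_K_M.gguf

# 8192-token KV cache, group-attention factor 4 and width 512 (self-extend),
# batch size 512, 5 layers offloaded to the GPU
../../../build/bin/server \
    -m phi-2.Q4_K_M.gguf \
    -c 8192 -b 512 -ngl 5 \
    --grp-attn-n 4 --grp-attn-w 512
```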

examples/server/tests/features/security.feature

Lines changed: 2 additions & 1 deletion

@@ -1,9 +1,10 @@
 @llama.cpp
+@security
 Feature: Security

   Background: Server startup with an api key defined
     Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   a server api key llama.cpp
     Then  the server is starting
     Then  the server is healthy

examples/server/tests/features/server.feature

Lines changed: 15 additions & 8 deletions

@@ -1,15 +1,17 @@
 @llama.cpp
+@server
 Feature: llama.cpp server

   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   a model alias tinyllama-2
     And   42 as server seed
     # KV Cache corresponds to the total amount of tokens
     # that can be stored across all independent sequences: #4130
     # see --ctx-size and #5568
     And   32 KV cache size
+    And   512 as batch size
     And   1 slots
     And   embeddings extraction
     And   32 server max tokens to predict
@@ -29,9 +31,9 @@ Feature: llama.cpp server
     And   prometheus metrics are exposed

     Examples: Prompts
-      | prompt                           | n_predict | re_content                             | n_predicted |
-      | I believe the meaning of life is | 8         | (read<or>going)+                       | 8           |
-      | Write a joke about AI            | 64        | (park<or>friends<or>scared<or>always)+ | 32          |
+      | prompt                           | n_predict | re_content                       | n_predicted |
+      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
+      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |

   Scenario Outline: OAI Compatibility
     Given a model <model>
@@ -43,9 +45,9 @@ Feature: llama.cpp server
     Then  <n_predicted> tokens are predicted matching <re_content>

     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                 | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom<or>what)+             | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks<or>happy<or>bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_predicted | enable_streaming |
+      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |

   Scenario: Embedding
     When embeddings are computed for:
@@ -75,10 +77,15 @@ Feature: llama.cpp server
     When an OAI compatible embeddings computation request for multiple inputs
     Then embeddings are generated

-
   Scenario: Tokenize / Detokenize
     When tokenizing:
     """
       What is the capital of France ?
     """
     Then tokens can be detokenize
+
+  Scenario: Models available
+    Given available models
+    Then  1 models are supported
+    Then  model 0 is identified by tinyllama-2
+    Then  model 0 is trained on 128 tokens context
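
The "prometheus metrics are exposed" step relates to the `/metrics` endpoint whose rendering is fixed in server.cpp above. A hedged sketch of checking it by hand, assuming the server was started with metrics enabled and the default port; the output lines only illustrate the HELP/TYPE/value format emitted by the code:

```shell
# scrape the Prometheus-style metrics endpoint
curl -s http://localhost:8080/metrics

# illustrative output format (one triplet per metric; values default to 0
# when no prompt has been processed yet):
# # HELP llamacpp:<name> <help>
# # TYPE llamacpp:<name> <type>
# llamacpp:<name> <value>
```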
