
Commit 0765d9c

server: tests: add health check and concurrent request example
1 parent eb65592 commit 0765d9c

3 files changed: +151 -29 lines

examples/server/tests/features/server.feature

Lines changed: 19 additions & 3 deletions
@@ -1,5 +1,13 @@
 Feature: llama.cpp server
 
+  Background: The server is started and ready to accept prompts
+    When wait for the server to be started
+    Then wait for the server to be healthy
+
+  Scenario: Health endpoint
+    Given an health liveness probe
+    Then the server must be healthy
+
   Scenario Outline: run a completion request
     Given a prompt <prompt>
     When we request a completion
@@ -18,6 +26,14 @@ Feature: llama.cpp server
     Then the oai response contains completion tokens
 
     Examples: Prompts
-      | model       | system_prompt               | user_prompt                         |
-      | tinyllama-2 | You are ChatGPT.            | Say hello                           |
-      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
+      | model       | system_prompt               | user_prompt                         |
+      | tinyllama-2 | You are ChatGPT.            | Say hello                           |
+      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
+
+
+  Scenario: Health endpoint during processing with concurrent requests
+    Given 2 slow concurrent prompts
+    Then wait for all slots processing
+    Then the server is overloaded
+    When wait for all slots idle
+    Then all prompts must be predicted
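
For readers following the new scenarios outside behave, here is a minimal standalone sketch of what "Health endpoint during processing with concurrent requests" exercises. It assumes a llama.cpp server already listening on localhost:8080 with --parallel 2; the helper name and the timing of the health probe are illustrative only, and unlike the scenario it does not wait for both slots to report as processing before probing.

# Illustrative only: fire two slow completion requests, then probe
# /health?fail_on_no_slot while the slots are expected to be busy.
import threading

import requests

base_url = "http://localhost:8080"


def post_completion(results, prompt):
    # Each thread issues one blocking completion request and stores the response.
    results.append(requests.post(f"{base_url}/completion", json={"prompt": prompt}))


results = []
threads = [threading.Thread(target=post_completion, args=(results, "say hello " * 10))
           for _ in range(2)]
for t in threads:
    t.start()

# With both slots busy, the endpoint is expected to answer 503 "no slot available";
# without the wait-for-slots step of the scenario this can race with request startup.
print(requests.get(f"{base_url}/health?fail_on_no_slot").status_code)

for t in threads:
    t.join()
for r in results:
    assert r.status_code == 200
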
Lines changed: 129 additions & 13 deletions
@@ -1,48 +1,105 @@
+import socket
+import threading
+import time
+from contextlib import closing
+
 import openai
 import requests
-from behave import *
+from behave import step
+from behave.api.async_step import async_run_until_complete
+
+base_fqdn = 'localhost'
+base_port = 8080
+base_url = f"http://{base_fqdn}:{base_port}"
 
 openai.api_key = 'llama.cpp'
-openai.api_base = "http://localhost:8080/v1/chat"
+openai.api_base = f"{base_url}/v1/chat"
+
+slow_prompt = 'say hello ' * 10
+fast_prompt = 'Write a joke'
+
+n_slots = 2
+
+
+@step(u'wait for the server to be started')
+def step_wait_for_the_server_to_be_started(context):
+    server_started = False
+    while not server_started:
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+            result = sock.connect_ex((base_fqdn, base_port))
+            if result != 0:
+                print("server not ready: ", base_fqdn, base_port, result)
+                time.sleep(1)
+            else:
+                return 0
+
+
+@step(u'wait for the server to be healthy')
+def step_wait_for_the_server_to_be_healthy(context):
+    status_code = 500
+    while status_code != 200:
+        status_code = requests.get(f'{base_url}/health').status_code
+        if status_code != 200:
+            time.sleep(1)
 
 
-@given(u'a prompt {prompt}')
+@step(u'an health liveness probe')
+def step_an_health_liveness_probe(context):
+    response = requests.get(f'{base_url}/health')
+    context.status_code = response.status_code
+    context.response_data = response.json()
+
+
+@step(u'the server must be healthy')
+def step_server_healthy(context):
+    assert context.status_code == 200
+    assert context.response_data['status'] == 'ok'
+
+
+@step(u'the server is overloaded')
+@async_run_until_complete()
+async def step_server_overloaded(context):
+    response = requests.get(f'{base_url}/health?fail_on_no_slot')
+    assert response.status_code == 503
+    assert response.json()['status'] == 'no slot available'
+
+
+@step(u'a prompt {prompt}')
 def step_prompt(context, prompt):
     context.prompt = prompt
 
 
-@when(u'we request a completion')
+@step(u'we request a completion')
 def step_request_completion(context):
-    response = requests.post('http://localhost:8080/completion', json={
+    response = requests.post(f'{base_url}/completion', json={
         "prompt": context.prompt
     })
     status_code = response.status_code
     assert status_code == 200
     context.response_data = response.json()
 
 
-@then(u'tokens are predicted')
+@step(u'tokens are predicted')
 def step_request_completion(context):
-    assert len(context.response_data['content']) > 0
-    assert context.response_data['timings']['predicted_n'] > 0
+    prompt_predicted(context.response_data)
 
 
-@given(u'a user prompt {user_prompt}')
+@step(u'a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.user_prompt = user_prompt
 
 
-@given(u'a system prompt {system_prompt}')
+@step(u'a system prompt {system_prompt}')
 def step_system_prompt(context, system_prompt):
     context.system_prompt = system_prompt
 
 
-@given(u'a model {model}')
+@step(u'a model {model}')
 def step_model(context, model):
     context.model = model
 
 
-@when(u'we request the oai completions endpoint')
+@step(u'we request the oai completions endpoint')
 def step_oai_completions(context):
     context.chat_completion = openai.Completion.create(
         messages=[
@@ -59,8 +116,67 @@ def step_oai_completions(context):
     )
 
 
-@then(u'the oai response contains completion tokens')
+@step(u'the oai response contains completion tokens')
 def step_oai_response_has_completion_tokens(context):
     assert len(context.chat_completion.choices) == 1
     assert len(context.chat_completion.choices[0].message) > 0
     assert context.chat_completion.usage.completion_tokens > 0
+
+
+def async_prompt(context, prompt):
+    response = requests.post(f'{base_url}/completion', json={
+        "prompt": prompt
+    })
+
+    context.async_responses.append(response)
+
+
+@step(u'{n_prompt} {prompt_type} concurrent prompts')
+def step_n_concurrent_prompts(context, n_prompt, prompt_type):
+    prompt = fast_prompt
+    if prompt_type == 'slow':
+        prompt = slow_prompt
+    context.async_responses = []
+    context.threads = []
+    for i in range(int(n_prompt)):
+        thread = threading.Thread(target=async_prompt, args=(context, prompt))
+        thread.start()
+        context.threads.append(thread)
+
+
+def wait_for_slots_processing(context, expected_slots_processing):
+    while True:
+        health = requests.get(f'{base_url}/health').json()
+        if 'slots_processing' in health:  # FIXME when #5594 is merged
+            slots_processing = health['slots_processing']
+        else:
+            slots_processing = 0
+        if slots_processing == expected_slots_processing:
+            break
+        else:
+            time.sleep(0.2)
+
+
+@step(u'wait for all slots processing')
+def step_wait_for_all_slots_processing(context):
+    wait_for_slots_processing(context, n_slots)
+
+
+@step(u'wait for all slots idle')
+def step_wait_for_all_slots_idle(context):
+    wait_for_slots_processing(context, 0)
+
+
+@step(u'all prompts must be predicted')
+def step_all_prompts_must_be_predicted(context):
+    for thread in context.threads:
+        thread.join()
+    for async_response in context.async_responses:
+        assert async_response.status_code == 200
+        response_data = async_response.json()
+        prompt_predicted(response_data)
+
+
+def prompt_predicted(response_data):
+    assert len(response_data['content']) > 0
+    assert response_data['timings']['predicted_n'] > 0
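
As a design note, the concurrent-prompts step keeps raw threads and the responses on the behave context so that later steps can probe /health while the requests are still in flight. A pool-based variant, sketched below under the assumption of the same /completion endpoint (the names here are illustrative, not part of the commit), is more compact but blocks until every request finishes, so it only fits the final "all prompts must be predicted" check, not the mid-flight overload probe.

from concurrent.futures import ThreadPoolExecutor

import requests

base_url = "http://localhost:8080"


def post_completion(prompt):
    # One blocking completion request; the future carries the Response back.
    return requests.post(f"{base_url}/completion", json={"prompt": prompt})


def run_concurrent_prompts(n_prompt, prompt):
    # The with-block does not exit until all submitted requests have completed.
    with ThreadPoolExecutor(max_workers=n_prompt) as pool:
        futures = [pool.submit(post_completion, prompt) for _ in range(n_prompt)]
        return [future.result() for future in futures]


for response in run_concurrent_prompts(2, "say hello " * 10):
    assert response.status_code == 200
    assert response.json()["timings"]["predicted_n"] > 0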

examples/server/tests/tests.sh

Lines changed: 3 additions & 13 deletions
@@ -6,6 +6,7 @@ then
     exit 1
 fi
 
+# kill the server at the end
 cleanup() {
     pkill -P $$
 }
@@ -20,26 +21,15 @@ set -eu
 ../../../build/bin/server \
     --model "$model_path" \
     --alias tinyllama-2 \
-    --ctx-size 64 \
+    --ctx-size 1024 \
     --parallel 2 \
-    --n-predict 32 \
+    --n-predict 1024 \
     --batch-size 32 \
     --threads 4 \
     --threads-batch 4 \
     --embedding \
     --cont-batching \
     "$@" &
 
-# Wait for the server to start
-max_attempts=30
-attempts=${max_attempts}
-until curl --silent --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do
-    attempts=$(( attempts - 1));
-    [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }
-    sleep_time=$(( (max_attempts - attempts) * 2 ))
-    echo "waiting for server to be ready ${sleep_time}s..."
-    sleep ${sleep_time}
-done
-
 # Start tests
 behave
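
With the curl loop gone, readiness is now handled by the behave Background steps, which retry indefinitely. If a bounded wait is preferred, a client-side sketch along the lines of the removed bash loop could look like this (the function name and retry policy are illustrative, not part of the commit):

import time

import requests


def wait_for_health(base_url="http://localhost:8080", max_attempts=30, delay=1.0):
    # Poll /health until it reports {"status": "ok"}, giving up after max_attempts.
    for attempt in range(1, max_attempts + 1):
        try:
            if requests.get(f"{base_url}/health").json().get("status") == "ok":
                return
        except requests.exceptions.ConnectionError:
            pass  # server socket not open yet
        print(f"waiting for server to be ready ({attempt}/{max_attempts})...")
        time.sleep(delay)
    raise RuntimeError("Server did not start up")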
