Commit 0f4db31

phymbert authored and mglambda committed
server: bench: minor fixes (ggml-org#10765)
* server/bench:
  - support openAI streaming standard output with [DONE]\n\n
  - export k6 raw results in csv
  - fix too many tcp idle connection in tcp_wait
  - add metric time to emit first token

* server/bench:
  - fix when prometheus not started
  - wait for server to be ready before starting bench
1 parent 065e609 commit 0f4db31

File tree

- examples/server/bench/README.md
- examples/server/bench/bench.py
- examples/server/bench/script.js

3 files changed: +39 -15 lines changed

examples/server/bench/README.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example:
+Example (assuming golang >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
 
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
 --model ggml-model-q4_0.gguf \
 --cont-batching \
 --metrics \
````
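The README example above also assumes the benchmarked server exposes its built-in `/health` endpoint (and `/metrics`, enabled by the `--metrics` flag). As a rough illustration, not part of this commit, a manually started `llama-server` can be polled the same way `bench.py` now does before launching k6; this sketch assumes Python with the `requests` package and the host/port from the example above:

```python
import time

import requests  # same HTTP client bench.py already uses


def wait_until_ready(host="localhost", port=8080, timeout_s=300.0):
    """Poll llama-server's /health endpoint until it answers HTTP 200."""
    url = f"http://{host}:{port}/health"
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not listening yet, keep polling
        time.sleep(0.5)
    return False


if __name__ == "__main__":
    print("server ready:", wait_until_ready())
```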

examples/server/bench/bench.py

Lines changed: 21 additions & 9 deletions

```diff
@@ -189,12 +189,12 @@ def main(args_in: list[str] | None = None) -> None:
         "pp": {
             "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
         },
         "tg": {
             "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
         },
     }
     with open("results.github.env", 'a') as github_env:
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
@@ -231,7 +234,7 @@ def start_server(args):
     server_process = start_server_background(args)
 
     attempts = 0
-    max_attempts = 20
+    max_attempts = 600
     if 'GITHUB_ACTIONS' in os.environ:
         max_attempts *= 2
 
@@ -242,7 +245,15 @@
         print(f"bench: waiting for server to start ...")
         time.sleep(0.5)
 
-    print("bench: server started.")
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench: waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
     return server_process
 
 
@@ -255,11 +266,6 @@ def start_server_background(args):
         '--host', args.host,
         '--port', args.port,
     ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
     server_args.extend(['--hf-repo', args.hf_repo])
     server_args.extend(['--hf-file', args.hf_file])
     server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
         return _is_server_listening
 
 
+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())
 
```
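The new `--out csv=k6-results.csv` option makes k6 write every raw metric sample alongside the JSON summary. A minimal post-processing sketch (not part of this commit), assuming k6's default CSV columns `metric_name` and `metric_value` and the `llamacpp_emit_first_token_second` trend added in `script.js` below:

```python
import csv
from statistics import mean

# Aggregate raw samples of the time-to-first-token trend exported by k6.
samples = []
with open("k6-results.csv", newline="") as f:
    for row in csv.DictReader(f):
        if row["metric_name"] == "llamacpp_emit_first_token_second":
            samples.append(float(row["metric_value"]))

if samples:
    print(f"time to emit first token: avg={mean(samples):.3f}s over {len(samples)} samples")
else:
    print("no llamacpp_emit_first_token_second samples found")
```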

examples/server/bench/script.js

Lines changed: 15 additions & 3 deletions

```diff
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+            "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
     client.on('event', function (event) {
         if (promptEvalEndTime == null) {
             promptEvalEndTime = new Date()
+            llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+        }
+
+        if (event.data === '[DONE]' || event.data === '') {
+            return
         }
 
         let chunk = JSON.parse(event.data)
-        let choice = chunk.choices[0]
-        if (choice.finish_reason) {
-            finish_reason = choice.finish_reason
+
+        if (chunk.choices && chunk.choices.length > 0) {
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
         }
 
         if (chunk.usage) {
```
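For reference, the `[DONE]` handling and time-to-first-token measurement added above can be reproduced outside k6. A standalone sketch (not part of this commit), assuming an OpenAI-compatible `/v1/chat/completions` endpoint on `localhost:8080`, Python with `requests`, and an illustrative payload:

```python
import json
import time

import requests

url = "http://localhost:8080/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "Write a short story about a coffee machine."}],
    "model": "model",
    "stream": True,
    "max_tokens": 128,
}

start = time.time()
first_token_s = None
finish_reason = None

with requests.post(url, json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line.startswith(b"data: "):
            continue  # skip empty keep-alive lines and non-data SSE fields
        if first_token_s is None:
            first_token_s = time.time() - start  # mirrors llamacpp_emit_first_token_second
        data = line[len(b"data: "):]
        if data == b"[DONE]":  # OpenAI streaming terminator, now handled by script.js
            break
        chunk = json.loads(data)
        # As in script.js: the final usage-only chunk may carry no choices.
        if chunk.get("choices"):
            reason = chunk["choices"][0].get("finish_reason")
            if reason:
                finish_reason = reason

print(f"time to first token: {first_token_s}s, finish_reason: {finish_reason}")
```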
