@@ -189,12 +189,12 @@ def main(args_in: list[str] | None = None) -> None:
189
189
"pp" : {
190
190
"p95" : round (data ['metrics' ]["llamacpp_prompt_processing_second" ]["p(95)" ], 2 ),
191
191
"avg" : round (data ['metrics' ]["llamacpp_prompt_processing_second" ]["avg" ], 2 ),
192
- "0" : round (mean (prometheus_metrics ['prompt_tokens_seconds' ]), 2 ),
192
+ "0" : round (mean (prometheus_metrics ['prompt_tokens_seconds' ]), 2 ) if 'prompt_tokens_seconds' in prometheus_metrics else 0 ,
193
193
},
194
194
"tg" : {
195
195
"p95" : round (data ['metrics' ]["llamacpp_tokens_second" ]["p(95)" ], 2 ),
196
196
"avg" : round (data ['metrics' ]["llamacpp_tokens_second" ]["avg" ], 2 ),
197
- "0" : round (mean (prometheus_metrics ['predicted_tokens_seconds' ]), 2 ),
197
+ "0" : round (mean (prometheus_metrics ['predicted_tokens_seconds' ]), 2 ) if 'predicted_tokens_seconds' in prometheus_metrics else 0 ,
198
198
},
199
199
}
200
200
with open ("results.github.env" , 'a' ) as github_env :
@@ -214,11 +214,14 @@ def start_benchmark(args):
214
214
k6_args = [
215
215
'run' , args .scenario ,
216
216
'--no-color' ,
217
+ '--no-connection-reuse' ,
218
+ '--no-vu-connection-reuse' ,
217
219
]
218
220
k6_args .extend (['--duration' , args .duration ])
219
221
k6_args .extend (['--iterations' , args .n_prompts ])
220
222
k6_args .extend (['--vus' , args .parallel ])
221
223
k6_args .extend (['--summary-export' , 'k6-results.json' ])
224
+ k6_args .extend (['--out' , 'csv=k6-results.csv' ])
222
225
args = f"SERVER_BENCH_N_PROMPTS={ args .n_prompts } SERVER_BENCH_MAX_PROMPT_TOKENS={ args .max_prompt_tokens } SERVER_BENCH_MAX_CONTEXT={ args .max_tokens } "
223
226
args = args + ' ' .join ([str (arg ) for arg in [k6_path , * k6_args ]])
224
227
print (f"bench: starting k6 with: { args } " )
@@ -231,7 +234,7 @@ def start_server(args):
231
234
server_process = start_server_background (args )
232
235
233
236
attempts = 0
234
- max_attempts = 20
237
+ max_attempts = 600
235
238
if 'GITHUB_ACTIONS' in os .environ :
236
239
max_attempts *= 2
237
240
@@ -242,7 +245,15 @@ def start_server(args):
242
245
print (f"bench: waiting for server to start ..." )
243
246
time .sleep (0.5 )
244
247
245
- print ("bench: server started." )
248
+ attempts = 0
249
+ while not is_server_ready (args .host , args .port ):
250
+ attempts += 1
251
+ if attempts > max_attempts :
252
+ assert False , "server not ready"
253
+ print (f"bench: waiting for server to be ready ..." )
254
+ time .sleep (0.5 )
255
+
256
+ print ("bench: server started and ready." )
246
257
return server_process
247
258
248
259
@@ -255,11 +266,6 @@ def start_server_background(args):
255
266
'--host' , args .host ,
256
267
'--port' , args .port ,
257
268
]
258
- model_file = args .model_path_prefix + os .path .sep + args .hf_file
259
- model_dir = os .path .dirname (model_file )
260
- if not os .path .exists (model_dir ):
261
- os .makedirs (model_dir )
262
- server_args .extend (['--model' , model_file ])
263
269
server_args .extend (['--hf-repo' , args .hf_repo ])
264
270
server_args .extend (['--hf-file' , args .hf_file ])
265
271
server_args .extend (['--n-gpu-layers' , args .n_gpu_layers ])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
303
309
return _is_server_listening
304
310
305
311
312
def is_server_ready(server_fqdn, server_port, timeout=5.0):
    """Report whether the server answers its ``/health`` endpoint with HTTP 200.

    Args:
        server_fqdn: Host name or address used to build the health URL.
        server_port: Port used to build the health URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        True when the GET request completes with status code 200; False for
        any other status code, or when the request fails or times out.
    """
    url = f"http://{server_fqdn}:{server_port}/health"
    try:
        # Without a timeout a stalled server would block this call forever,
        # so the caller's retry limit would never be reached.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # A refused/reset connection or a timeout means "not ready yet";
        # let the caller's polling loop decide when to give up.
        return False
    return response.status_code == 200
316
+
317
+
306
318
def escape_metric_name(metric_name):
    """Return ``metric_name`` upper-cased, with each character outside A-Z / 0-9 replaced by ``_``."""
    uppercased = metric_name.upper()
    # The substitution runs on the upper-cased text, so lower-case letters
    # are preserved (as capitals) rather than replaced.
    return re.sub('[^A-Z0-9]', '_', uppercased)
308
320
0 commit comments