22
22
from sagemaker .model import Model
23
23
from sagemaker import model_uris
24
24
from sagemaker .serve .model_server .djl_serving .prepare import prepare_djl_js_resources
25
- from sagemaker .serve .model_server .djl_serving .utils import _get_admissible_tensor_parallel_degrees
25
+ from sagemaker .serve .model_server .djl_serving .utils import _get_admissible_tensor_parallel_degrees , \
26
+ _get_admissible_dtypes
26
27
from sagemaker .serve .model_server .tgi .prepare import prepare_tgi_js_resources , _create_dir_structure
27
28
from sagemaker .serve .mode .function_pointers import Mode
28
29
from sagemaker .serve .utils .exceptions import (
42
43
_serial_benchmark ,
43
44
_concurrent_benchmark ,
44
45
_more_performant ,
45
- _pretty_print_results_djl_js ,
46
- _pretty_print_results_tgi_js
46
+ _pretty_print_benchmark_results
47
47
)
48
48
from sagemaker .serve .utils .types import ModelServer
49
49
from sagemaker .base_predictor import PredictorBase
@@ -254,169 +254,6 @@ def _build_for_tgi_jumpstart(self):
254
254
255
255
self .pysdk_model .env .update (env )
256
256
257
- def _log_delete_me (self , data : any ):
258
- """Placeholder docstring"""
259
- logger .debug ("*****************************************" )
260
- logger .debug (data )
261
- logger .debug ("*****************************************" )
262
-
263
- @_capture_telemetry ("djl_jumpstart.tune" )
264
- def tune_for_djl_jumpstart (self , max_tuning_duration : int = 1800 ):
265
- """pass"""
266
- if self .mode != Mode .LOCAL_CONTAINER :
267
- logger .warning (
268
- "Tuning is only a %s capability. Returning original model." , Mode .LOCAL_CONTAINER
269
- )
270
- return self .pysdk_model
271
-
272
- initial_model_configuration = copy .deepcopy (self .pysdk_model .env )
273
- self ._log_delete_me (f"initial_model_configuration: { initial_model_configuration } " )
274
-
275
- self ._log_delete_me (f"self.js_model_config: { self .js_model_config } " )
276
- admissible_tensor_parallel_degrees = _get_admissible_tensor_parallel_degrees (self .js_model_config )
277
- self ._log_delete_me (f"admissible_tensor_parallel_degrees: { admissible_tensor_parallel_degrees } " )
278
-
279
- benchmark_results = {}
280
- best_tuned_combination = None
281
- timeout = datetime .now () + timedelta (seconds = max_tuning_duration )
282
- for tensor_parallel_degree in admissible_tensor_parallel_degrees :
283
- self ._log_delete_me (f"tensor_parallel_degree: { tensor_parallel_degree } " )
284
- if datetime .now () > timeout :
285
- logger .info ("Max tuning duration reached. Tuning stopped." )
286
- break
287
-
288
- self .pysdk_model .env .update ({
289
- "OPTION_TENSOR_PARALLEL_DEGREE" : str (tensor_parallel_degree )
290
- })
291
-
292
- try :
293
- predictor = self .pysdk_model .deploy (
294
- model_data_download_timeout = max_tuning_duration
295
- )
296
-
297
- avg_latency , p90 , avg_tokens_per_second = _serial_benchmark (
298
- predictor , self .schema_builder .sample_input
299
- )
300
- throughput_per_second , standard_deviation = _concurrent_benchmark (
301
- predictor , self .schema_builder .sample_input
302
- )
303
-
304
- tested_env = self .pysdk_model .env .copy ()
305
- logger .info (
306
- "Average latency: %s, throughput/s: %s for configuration: %s" ,
307
- avg_latency ,
308
- throughput_per_second ,
309
- tested_env ,
310
- )
311
- benchmark_results [avg_latency ] = [
312
- tested_env ,
313
- p90 ,
314
- avg_tokens_per_second ,
315
- throughput_per_second ,
316
- standard_deviation ,
317
- ]
318
-
319
- if not best_tuned_combination :
320
- best_tuned_combination = [
321
- avg_latency ,
322
- tensor_parallel_degree ,
323
- None ,
324
- p90 ,
325
- avg_tokens_per_second ,
326
- throughput_per_second ,
327
- standard_deviation ,
328
- ]
329
- else :
330
- tuned_configuration = [
331
- avg_latency ,
332
- tensor_parallel_degree ,
333
- None ,
334
- p90 ,
335
- avg_tokens_per_second ,
336
- throughput_per_second ,
337
- standard_deviation ,
338
- ]
339
- if _more_performant (best_tuned_combination , tuned_configuration ):
340
- best_tuned_combination = tuned_configuration
341
- except LocalDeepPingException as e :
342
- logger .warning (
343
- "Deployment unsuccessful with OPTION_TENSOR_PARALLEL_DEGREE: %s. "
344
- "Failed to invoke the model server: %s" ,
345
- tensor_parallel_degree ,
346
- str (e ),
347
- )
348
- break
349
- except LocalModelOutOfMemoryException as e :
350
- logger .warning (
351
- "Deployment unsuccessful with OPTION_TENSOR_PARALLEL_DEGREE: %s. "
352
- "Out of memory when loading the model: %s" ,
353
- tensor_parallel_degree ,
354
- str (e ),
355
- )
356
- break
357
- except LocalModelInvocationException as e :
358
- logger .warning (
359
- "Deployment unsuccessful with OPTION_TENSOR_PARALLEL_DEGREE: %s. "
360
- "Failed to invoke the model server: %s"
361
- "Please check that model server configurations are as expected "
362
- "(Ex. serialization, deserialization, content_type, accept)." ,
363
- tensor_parallel_degree ,
364
- str (e ),
365
- )
366
- break
367
- except LocalModelLoadException as e :
368
- logger .warning (
369
- "Deployment unsuccessful with OPTION_TENSOR_PARALLEL_DEGREE: %s. "
370
- "Failed to load the model: %s." ,
371
- tensor_parallel_degree ,
372
- str (e ),
373
- )
374
- break
375
- except SkipTuningComboException as e :
376
- logger .warning (
377
- "Deployment with OPTION_TENSOR_PARALLEL_DEGREE: %s. "
378
- "was expected to be successful. However failed with: %s. "
379
- "Trying next combination." ,
380
- tensor_parallel_degree ,
381
- str (e ),
382
- )
383
- break
384
- except Exception :
385
- logger .exception (
386
- "Deployment unsuccessful with OPTION_TENSOR_PARALLEL_DEGREE: %s. "
387
- "with uncovered exception" ,
388
- tensor_parallel_degree
389
- )
390
- break
391
-
392
- if best_tuned_combination :
393
- self .pysdk_model .env .update ({
394
- "OPTION_TENSOR_PARALLEL_DEGREE" : str (best_tuned_combination [1 ])
395
- })
396
-
397
- _pretty_print_results_djl_js (benchmark_results )
398
- logger .info (
399
- "Model Configuration: %s was most performant with avg latency: %s, "
400
- "p90 latency: %s, average tokens per second: %s, throughput/s: %s, "
401
- "standard deviation of request %s" ,
402
- self .pysdk_model .env ,
403
- best_tuned_combination [0 ],
404
- best_tuned_combination [3 ],
405
- best_tuned_combination [4 ],
406
- best_tuned_combination [5 ],
407
- best_tuned_combination [6 ],
408
- )
409
- else :
410
- self .pysdk_model .env .update (initial_model_configuration )
411
- logger .debug (
412
- "Failed to gather any tuning results. "
413
- "Please inspect the stack trace emitted from live logging for more details. "
414
- "Falling back to default model environment variable configurations: %s" ,
415
- self .pysdk_model .env ,
416
- )
417
-
418
- return self .pysdk_model
419
-
420
257
@_capture_telemetry ("tgi_jumpstart.tune" )
421
258
def tune_for_tgi_jumpstart (self , max_tuning_duration : int = 1800 ):
422
259
"""Placeholder docstring"""
@@ -426,19 +263,13 @@ def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
426
263
)
427
264
return self .pysdk_model
428
265
429
- initial_model_configuration = copy .deepcopy (self .pysdk_model .env )
430
- self ._log_delete_me (f"initial_model_configuration: { initial_model_configuration } " )
431
-
432
- self ._log_delete_me (f"self.js_model_config: { self .js_model_config } " )
433
-
266
+ initial_env_vars = copy .deepcopy (self .pysdk_model .env )
434
267
admissible_tensor_parallel_degrees = _get_admissible_tensor_parallel_degrees (self .js_model_config )
435
- self ._log_delete_me (f"admissible_tensor_parallel_degrees: { admissible_tensor_parallel_degrees } " )
436
268
437
269
benchmark_results = {}
438
270
best_tuned_combination = None
439
271
timeout = datetime .now () + timedelta (seconds = max_tuning_duration )
440
272
for tensor_parallel_degree in admissible_tensor_parallel_degrees :
441
- self ._log_delete_me (f"tensor_parallel_degree: { tensor_parallel_degree } " )
442
273
if datetime .now () > timeout :
443
274
logger .info ("Max tuning duration reached. Tuning stopped." )
444
275
break
@@ -554,7 +385,9 @@ def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
554
385
"SM_NUM_GPUS" : str (best_tuned_combination [1 ])
555
386
})
556
387
557
- _pretty_print_results_tgi_js (benchmark_results )
388
+ _pretty_print_benchmark_results (benchmark_results , [
389
+ "SM_NUM_GPUS"
390
+ ])
558
391
logger .info (
559
392
"Model Configuration: %s was most performant with avg latency: %s, "
560
393
"p90 latency: %s, average tokens per second: %s, throughput/s: %s, "
@@ -567,7 +400,7 @@ def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
567
400
best_tuned_combination [6 ],
568
401
)
569
402
else :
570
- self .pysdk_model .env .update (initial_model_configuration )
403
+ self .pysdk_model .env .update (initial_env_vars )
571
404
logger .debug (
572
405
"Failed to gather any tuning results. "
573
406
"Please inspect the stack trace emitted from live logging for more details. "
@@ -612,6 +445,4 @@ def _build_for_jumpstart(self):
612
445
613
446
if self .model_server == ModelServer .TGI :
614
447
self .pysdk_model .tune = self .tune_for_tgi_jumpstart
615
- elif self .model_server == ModelServer .DJL_SERVING :
616
- self .pysdk_model .tune = self .tune_for_djl_jumpstart
617
448
return self .pysdk_model
0 commit comments