     LocalModelOutOfMemoryException,
     LocalModelInvocationException,
     LocalModelLoadException,
-    SkipTuningComboException
+    SkipTuningComboException,
 )
 from sagemaker.serve.utils.predictors import (
     DjlLocalModePredictor,
     TgiLocalModePredictor,
 )
-from sagemaker.serve.utils.local_hardware import _get_nb_instance, _get_ram_usage_mb, _get_available_gpus
+from sagemaker.serve.utils.local_hardware import (
+    _get_nb_instance,
+    _get_ram_usage_mb,
+    _get_available_gpus,
+)
 from sagemaker.serve.utils.telemetry_logger import _capture_telemetry
 from sagemaker.serve.utils.tuning import (
     _serial_benchmark,
     _concurrent_benchmark,
     _more_performant,
-    _pretty_print_benchmark_results
+    _pretty_print_benchmark_results,
 )
 from sagemaker.serve.utils.types import ModelServer
 from sagemaker.base_predictor import PredictorBase
@@ -150,10 +154,7 @@ def _js_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBase]:
                 model_data=self.pysdk_model.model_data,
             )
         elif not hasattr(self, "prepared_for_tgi"):
-            (
-                self.js_model_config,
-                self.prepared_for_tgi
-            ) = prepare_tgi_js_resources(
+            (self.js_model_config, self.prepared_for_tgi) = prepare_tgi_js_resources(
                 model_path=self.model_path,
                 js_id=self.model,
                 dependencies=self.dependencies,
@@ -241,10 +242,7 @@ def _build_for_tgi_jumpstart(self):
         env = {}
         if self.mode == Mode.LOCAL_CONTAINER:
             if not hasattr(self, "prepared_for_tgi"):
-                (
-                    self.js_model_config,
-                    self.prepared_for_tgi
-                ) = prepare_tgi_js_resources(
+                (self.js_model_config, self.prepared_for_tgi) = prepare_tgi_js_resources(
                     model_path=self.model_path,
                     js_id=self.model,
                     dependencies=self.dependencies,
@@ -256,28 +254,15 @@ def _build_for_tgi_jumpstart(self):

         self.pysdk_model.env.update(env)

-    def _logging_debug(self, message):
-        logging.debug("**************************************")
-        logging.debug(message)
-        logging.debug("**************************************")
-
     def _tune_for_js(
-        self,
-        num_shard_env_var: str = "SM_NUM_GPUS",
-        num_model_copies_env_var: str = "SAGEMAKER_MODEL_SERVER_WORKERS",
-        multiple_model_copies_enabled: bool = False,
-        max_tuning_duration: int = 1800):
-        """Tune for Jumpstart Models
+        self, multiple_model_copies_enabled: bool = False, max_tuning_duration: int = 1800
+    ):
+        """Tune for Jumpstart Models.

         Args:
-            num_shard_env_var (str): The name of the environment variable specifies the
-                number of GPUs available for training or inference. Default: ``SM_NUM_GPUS``
-                For example SM_NUM_GPUS, or OPTION_TENSOR_PARALLEL_DEGREE.
-            num_model_copies_env_var (str): The name of the environment variable that specifies the
-                number of worker processes used by this model. Default: ``SAGEMAKER_MODEL_SERVER_WORKERS``
             multiple_model_copies_enabled (bool): Whether multiple model copies serving is enabled by
-                the Serving container. Defaults to ``False``
-            max_tuning_duration (int): The maximum duration to deploy this ``Model`` locally.
+                this ``DLC``. Defaults to ``False``
+            max_tuning_duration (int): The maximum timeout to deploy this ``Model`` locally.
                 Default: ``1800``
         returns:
             Tuned Model.
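
The refactored signature drops both env-var-name parameters; callers now pass only the two tuning knobs, and the variable names are derived inside the method. A minimal sketch of how the per-container wrappers later in this diff invoke it (illustrative only; the surrounding builder instance is elided):

```python
# Sketch of the simplified call; `self` is the JumpStart builder shown in this diff.
tuned_model = self._tune_for_js(
    multiple_model_copies_enabled=True,  # DJL DLC supports multiple model copies
    max_tuning_duration=1800,            # stop tuning after 30 minutes
)
```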
@@ -288,21 +273,19 @@ def _tune_for_js(
         )
         return self.pysdk_model

-        initial_env_vars = copy.deepcopy(self.pysdk_model.env)
-        admissible_tensor_parallel_degrees = _get_admissible_tensor_parallel_degrees(self.js_model_config)
-
-        self._logging_debug(
-            f"initial_env_vars: {initial_env_vars},"
-            f" admissible_tensor_parallel_degrees: {admissible_tensor_parallel_degrees}")
+        num_shard_env_var_name = "SM_NUM_GPUS"
+        if "OPTION_TENSOR_PARALLEL_DEGREE" in self.pysdk_model.env.keys():
+            num_shard_env_var_name = "OPTION_TENSOR_PARALLEL_DEGREE"

+        initial_env_vars = copy.deepcopy(self.pysdk_model.env)
+        admissible_tensor_parallel_degrees = _get_admissible_tensor_parallel_degrees(
+            self.js_model_config
+        )
         available_gpus = _get_available_gpus() if multiple_model_copies_enabled else None
-        self._logging_debug(
-            f"multiple_model_copies_enabled: {multiple_model_copies_enabled}, available_gpus: {available_gpus}")

         benchmark_results = {}
         best_tuned_combination = None
         timeout = datetime.now() + timedelta(seconds=max_tuning_duration)
-
         for tensor_parallel_degree in admissible_tensor_parallel_degrees:
             if datetime.now() > timeout:
                 logger.info("Max tuning duration reached. Tuning stopped.")
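
With the parameter removed, the shard variable name is now inferred from the model's existing environment: DJL-style models that already carry `OPTION_TENSOR_PARALLEL_DEGREE` keep that key, and everything else falls back to `SM_NUM_GPUS`. The same logic isolated as a runnable sketch (the helper name here is hypothetical, not from the diff):

```python
# Hypothetical standalone version of the env-var selection logic added above.
def pick_num_shard_env_var_name(env: dict) -> str:
    if "OPTION_TENSOR_PARALLEL_DEGREE" in env:
        return "OPTION_TENSOR_PARALLEL_DEGREE"
    return "SM_NUM_GPUS"

assert pick_num_shard_env_var_name({"OPTION_TENSOR_PARALLEL_DEGREE": "2"}) == "OPTION_TENSOR_PARALLEL_DEGREE"
assert pick_num_shard_env_var_name({"SM_NUM_GPUS": "4"}) == "SM_NUM_GPUS"
```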
@@ -312,25 +295,15 @@ def _tune_for_js(
             if multiple_model_copies_enabled:
                 sagemaker_model_server_workers = int(available_gpus / tensor_parallel_degree)

-                self.pysdk_model.env.update({
-                    num_shard_env_var: str(tensor_parallel_degree),
-                    num_model_copies_env_var: str(sagemaker_model_server_workers)
-                })
-
-                self._logging_debug(
-                    f"num_shard_env_var: {num_shard_env_var}, tensor_parallel_degree: {tensor_parallel_degree}")
-                self._logging_debug(f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
-                self._logging_debug(
-                    f"num_model_copies_env_var: {num_model_copies_env_var}, "
-                    f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
-                self._logging_debug(f"current model.env: {self.pysdk_model.env}")
+                self.pysdk_model.env.update(
+                    {
+                        num_shard_env_var_name: str(tensor_parallel_degree),
+                        "SAGEMAKER_MODEL_SERVER_WORKERS": str(sagemaker_model_server_workers),
+                    }
+                )

             try:
-                self._logging_debug(f"Deploying model with env config {self.pysdk_model.env}")
-                predictor = self.pysdk_model.deploy(
-                    model_data_download_timeout=max_tuning_duration
-                )
-                self._logging_debug(f"Deployed model with env config {self.pysdk_model.env}")
+                predictor = self.pysdk_model.deploy(model_data_download_timeout=max_tuning_duration)

                 avg_latency, p90, avg_tokens_per_second = _serial_benchmark(
                     predictor, self.schema_builder.sample_input
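
When multiple model copies are enabled, each tuning iteration pairs a tensor-parallel degree with as many model copies as fit on the available GPUs. The arithmetic in isolation (the function name is illustrative; the diff writes it inline as `int(available_gpus / tensor_parallel_degree)`):

```python
# Illustrative helper mirroring the inline worker-count computation above.
# For positive integers, int(a / b) and a // b agree.
def model_copies_for(available_gpus: int, tensor_parallel_degree: int) -> int:
    return available_gpus // tensor_parallel_degree

assert model_copies_for(8, 1) == 8  # 8 single-GPU copies
assert model_copies_for(8, 2) == 4  # 4 copies, each sharded over 2 GPUs
assert model_copies_for(8, 3) == 2  # 2 copies; 2 GPUs remain idle
```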
@@ -378,91 +351,77 @@ def _tune_for_js(
                     best_tuned_combination = tuned_configuration
             except LocalDeepPingException as e:
                 logger.warning(
-                    "Deployment unsuccessful with %s: %s. %s : %s"
+                    "Deployment unsuccessful with %s: %s. SAGEMAKER_MODEL_SERVER_WORKERS : %s"
                     "Failed to invoke the model server: %s",
-                    num_shard_env_var,
+                    num_shard_env_var_name,
                     tensor_parallel_degree,
-                    num_model_copies_env_var,
                     sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelOutOfMemoryException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with %s: %s. %s : %s. "
+                    "Deployment unsuccessful with %s: %s. SAGEMAKER_MODEL_SERVER_WORKERS : %s. "
                     "Out of memory when loading the model: %s",
-                    num_shard_env_var,
+                    num_shard_env_var_name,
                     tensor_parallel_degree,
-                    num_model_copies_env_var,
                     sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelInvocationException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with %s: %s. %s : %s. "
+                    "Deployment unsuccessful with %s: %s. SAGEMAKER_MODEL_SERVER_WORKERS : %s. "
                     "Failed to invoke the model server: %s"
                     "Please check that model server configurations are as expected "
                     "(Ex. serialization, deserialization, content_type, accept).",
-                    num_shard_env_var,
+                    num_shard_env_var_name,
                     tensor_parallel_degree,
-                    num_model_copies_env_var,
                     sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelLoadException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with %s: %s. %s : %s. "
+                    "Deployment unsuccessful with %s: %s. SAGEMAKER_MODEL_SERVER_WORKERS : %s. "
                     "Failed to load the model: %s.",
-                    num_shard_env_var,
+                    num_shard_env_var_name,
                     tensor_parallel_degree,
-                    num_model_copies_env_var,
                     sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except SkipTuningComboException as e:
                 logger.warning(
-                    f"Deployment with %s: %s. %s : %s. "
+                    "Deployment with %s: %s. SAGEMAKER_MODEL_SERVER_WORKERS : %s. "
                     "was expected to be successful. However failed with: %s. "
                     "Trying next combination.",
-                    num_shard_env_var,
+                    num_shard_env_var_name,
                     tensor_parallel_degree,
-                    num_model_copies_env_var,
                     sagemaker_model_server_workers,
                     str(e),
                 )
                 break
-            except Exception as e:
-                logger.exception(e)
+            except Exception:  # pylint: disable=W0703
                 logger.exception(
-                    f"Deployment unsuccessful with %s: %s. %s : %s. "
+                    "Deployment unsuccessful with %s: %s. SAGEMAKER_MODEL_SERVER_WORKERS : %s. "
                     "with uncovered exception",
-                    num_shard_env_var,
+                    num_shard_env_var_name,
                     tensor_parallel_degree,
-                    num_model_copies_env_var,
                     sagemaker_model_server_workers,
                 )
                 break

         if best_tuned_combination:
-            model_env_vars = [num_shard_env_var]
-
-            self.pysdk_model.env.update({
-                num_shard_env_var: str(best_tuned_combination[1])
-            })
-            if multiple_model_copies_enabled:
-                self.pysdk_model.env.update({
-                    num_model_copies_env_var: str(best_tuned_combination[2])
-                })
-                model_env_vars.append(num_model_copies_env_var)
-
-            self._logging_debug(f"benchmark_results: {benchmark_results}, model_env_vars: {model_env_vars}")
+            self.pysdk_model.env.update(
+                {
+                    num_shard_env_var_name: str(best_tuned_combination[1]),
+                    "SAGEMAKER_MODEL_SERVER_WORKERS": str(best_tuned_combination[2]),
+                }
+            )

             _pretty_print_benchmark_results(
-                benchmark_results,
-                model_env_vars
+                benchmark_results, [num_shard_env_var_name, "SAGEMAKER_MODEL_SERVER_WORKERS"]
             )
             logger.info(
                 "Model Configuration: %s was most performant with avg latency: %s, "
@@ -480,32 +439,26 @@ def _tune_for_js(
             logger.debug(
                 "Failed to gather any tuning results. "
                 "Please inspect the stack trace emitted from live logging for more details. "
-                "Falling back to default model environment variable configurations: %s",
+                "Falling back to default model configurations: %s",
                 self.pysdk_model.env,
             )

         return self.pysdk_model

     @_capture_telemetry("djl_jumpstart.tune")
     def tune_for_djl_jumpstart(self, max_tuning_duration: int = 1800):
-        """Tune for Jumpstart Models with DJL serving"""
-        num_shard_env_var = "SM_NUM_GPUS"
-        if "OPTION_TENSOR_PARALLEL_DEGREE" in self.pysdk_model.env.keys():
-            num_shard_env_var = "OPTION_TENSOR_PARALLEL_DEGREE"
-
+        """Tune for Jumpstart Models with DJL DLC"""
         return self._tune_for_js(
-            num_shard_env_var=num_shard_env_var,
-            multiple_model_copies_enabled=True,
-            max_tuning_duration=max_tuning_duration
+            multiple_model_copies_enabled=True, max_tuning_duration=max_tuning_duration
         )

     @_capture_telemetry("tgi_jumpstart.tune")
     def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
-        """Tune for Jumpstart Models with TGI serving"""
+        """Tune for Jumpstart Models with TGI DLC"""
         return self._tune_for_js(
             # Currently, TGI does not enable multiple model copies serving.
             multiple_model_copies_enabled=False,
-            max_tuning_duration=max_tuning_duration
+            max_tuning_duration=max_tuning_duration,
         )

     def _build_for_jumpstart(self):
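
The two wrappers now differ only in `multiple_model_copies_enabled`, and the hunks below bind them onto the built model as `.tune`. A hedged sketch of the intended caller experience; the import paths, model ID, and sample payloads are assumptions, not taken from this diff:

```python
from sagemaker.serve import ModelBuilder, SchemaBuilder
from sagemaker.serve.mode.function_pointers import Mode

# Placeholder JumpStart model ID and sample payloads; illustrative only.
schema_builder = SchemaBuilder(
    sample_input={"inputs": "Hello"},
    sample_output=[{"generated_text": "Hi"}],
)
builder = ModelBuilder(
    model="huggingface-llm-falcon-7b-bf16",
    schema_builder=schema_builder,
    mode=Mode.LOCAL_CONTAINER,
)
model = builder.build()

# After _build_for_jumpstart runs, .tune is bound to either
# tune_for_djl_jumpstart or tune_for_tgi_jumpstart.
tuned_model = model.tune(max_tuning_duration=1800)
```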
@@ -528,6 +481,8 @@ def _build_for_jumpstart(self):
             self.image_uri = self.pysdk_model.image_uri

             self._build_for_djl_jumpstart()
+
+            self.pysdk_model.tune = self.tune_for_djl_jumpstart
         elif "tgi-inference" in image_uri:
             logger.info("Building for TGI JumpStart Model ID...")
             self.model_server = ModelServer.TGI
@@ -536,13 +491,11 @@ def _build_for_jumpstart(self):
             self.image_uri = self.pysdk_model.image_uri

             self._build_for_tgi_jumpstart()
+
+            self.pysdk_model.tune = self.tune_for_tgi_jumpstart
         else:
             raise ValueError(
                 "JumpStart Model ID was not packaged with djl-inference or tgi-inference container."
             )

-        if self.model_server == ModelServer.TGI:
-            self.pysdk_model.tune = self.tune_for_tgi_jumpstart
-        elif self.model_server == ModelServer.DJL_SERVING:
-            self.pysdk_model.tune = self.tune_for_djl_jumpstart
         return self.pysdk_model
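
With this change the trailing `model_server` dispatch is gone: `.tune` is bound at the point where the container type is already known from the image URI. The branching, reduced to a runnable sketch (function name and URIs are illustrative):

```python
# Illustrative reduction of the dispatch implemented by the last two hunks.
def tune_method_for(image_uri: str) -> str:
    if "djl-inference" in image_uri:
        return "tune_for_djl_jumpstart"
    if "tgi-inference" in image_uri:
        return "tune_for_tgi_jumpstart"
    raise ValueError(
        "JumpStart Model ID was not packaged with djl-inference or tgi-inference container."
    )

assert tune_method_for("123456789012.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.26.0") == "tune_for_djl_jumpstart"
assert tune_method_for("123456789012.dkr.ecr.us-west-2.amazonaws.com/tgi-inference:2.0") == "tune_for_tgi_jumpstart"
```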