@@ -295,49 +295,42 @@ def _tune_for_js(
             f"initial_env_vars: {initial_env_vars},"
             f" admissible_tensor_parallel_degrees: {admissible_tensor_parallel_degrees}")

-        available_gpus = None
-        if multiple_model_copies_enabled:
-            available_gpus = _get_available_gpus()
+        available_gpus = _get_available_gpus() if multiple_model_copies_enabled else None
         self._logging_debug(
             f"multiple_model_copies_enabled: {multiple_model_copies_enabled}, available_gpus: {available_gpus}")

         benchmark_results = {}
         best_tuned_combination = None
         timeout = datetime.now() + timedelta(seconds=max_tuning_duration)
+
         for tensor_parallel_degree in admissible_tensor_parallel_degrees:
             if datetime.now() > timeout:
                 logger.info("Max tuning duration reached. Tuning stopped.")
                 break
-            try:
-                self.pysdk_model.env.update({
-                    num_shard_env_var: str(tensor_parallel_degree)
-                })
-                self._logging_debug(
-                    f"num_shard_env_var: {num_shard_env_var}, tensor_parallel_degree: {tensor_parallel_degree}")
-            except Exception as e:
-                self._logging_debug(str(e))
-
-            logging_msg = f"{num_shard_env_var}: {tensor_parallel_degree}."

-            sagemaker_model_server_workers = None
+            sagemaker_model_server_workers = 1
             if multiple_model_copies_enabled:
                 sagemaker_model_server_workers = int(available_gpus / tensor_parallel_degree)
-                self._logging_debug(f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
-                try:
-                    self.pysdk_model.env.update({
-                        num_model_copies_env_var: str(sagemaker_model_server_workers)
-                    })
-                    self._logging_debug(
-                        f"num_model_copies_env_var: {num_model_copies_env_var}, "
-                        f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
-                except Exception as e:
-                    self._logging_debug(str(e))
-                logging_msg = f"{logging_msg} {num_model_copies_env_var}: {sagemaker_model_server_workers}."
+
+            self.pysdk_model.env.update({
+                num_shard_env_var: str(tensor_parallel_degree),
+                num_model_copies_env_var: str(sagemaker_model_server_workers)
+            })
+
+            self._logging_debug(
+                f"num_shard_env_var: {num_shard_env_var}, tensor_parallel_degree: {tensor_parallel_degree}")
+            self._logging_debug(f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
+            self._logging_debug(
+                f"num_model_copies_env_var: {num_model_copies_env_var}, "
+                f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
+            self._logging_debug(f"current model.env: {self.pysdk_model.env}")

             try:
+                self._logging_debug(f"Deploying model with env config {self.pysdk_model.env}")
                 predictor = self.pysdk_model.deploy(
                     model_data_download_timeout=max_tuning_duration
                 )
+                self._logging_debug(f"Deployed model with env config {self.pysdk_model.env}")

                 avg_latency, p90, avg_tokens_per_second = _serial_benchmark(
                     predictor, self.schema_builder.sample_input
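
Note: the hunk above folds the two per-variable env updates (each wrapped in its own
try/except that could silently swallow failures) into a single atomic `env.update`, and
derives the number of model copies from the GPU budget before deploying. Below is a
minimal sketch of the resulting combination enumeration, assuming 8 available GPUs;
`SM_NUM_GPUS` is the shard variable from the TGI path later in this diff, while
`MODEL_SERVER_WORKERS` is a placeholder name, not necessarily the SDK's:

    # Minimal sketch, not the SDK implementation: enumerate the
    # (tensor parallel degree, model copies) combinations the tuner tries.
    AVAILABLE_GPUS = 8  # assumption for illustration

    for tensor_parallel_degree in (1, 2, 4, 8):
        # Each model copy occupies `tensor_parallel_degree` GPUs, so the number
        # of copies that fit on the host is the integer quotient of the two.
        model_server_workers = AVAILABLE_GPUS // tensor_parallel_degree
        env = {
            "SM_NUM_GPUS": str(tensor_parallel_degree),
            "MODEL_SERVER_WORKERS": str(model_server_workers),  # placeholder env var
        }
        print(env)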
@@ -385,53 +378,71 @@ def _tune_for_js(
                     best_tuned_combination = tuned_configuration
             except LocalDeepPingException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {num_shard_env_var}: %s. "
+                    "Deployment unsuccessful with %s: %s. %s: %s. "
                     "Failed to invoke the model server: %s",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelOutOfMemoryException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    "Deployment unsuccessful with %s: %s. %s: %s. "
                     "Out of memory when loading the model: %s",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelInvocationException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    "Deployment unsuccessful with %s: %s. %s: %s. "
                     "Failed to invoke the model server: %s"
                     "Please check that model server configurations are as expected "
                     "(Ex. serialization, deserialization, content_type, accept).",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelLoadException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    "Deployment unsuccessful with %s: %s. %s: %s. "
                     "Failed to load the model: %s.",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except SkipTuningComboException as e:
                 logger.warning(
-                    f"Deployment with {logging_msg} "
+                    "Deployment with %s: %s. %s: %s. "
                     "was expected to be successful. However failed with: %s. "
                     "Trying next combination.",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except Exception as e:
                 logger.exception(e)
                 logger.exception(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    "Deployment unsuccessful with %s: %s. %s: %s. "
                     "with uncovered exception",
-                    tensor_parallel_degree
+                    num_shard_env_var,
+                    tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                 )
                 break
 
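
Note: the warning rewrites above switch from f-strings, which interpolated `logging_msg`
eagerly, to lazy %-style arguments, so every handler now reports both the shard count and
the copy count, and the template string stays constant for log aggregation. A standalone
sketch of the pattern with illustrative values (the copies env var name is a placeholder):

    # Standalone sketch of the lazy %-formatting used in the handlers above.
    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)

    logger.warning(
        "Deployment unsuccessful with %s: %s. %s: %s. "
        "Out of memory when loading the model: %s",
        "SM_NUM_GPUS",
        4,
        "MODEL_SERVER_WORKERS",  # placeholder env var name
        2,
        "CUDA out of memory",    # illustrative error text
    )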
@@ -447,6 +458,8 @@ def _tune_for_js(
             })
             model_env_vars.append(num_model_copies_env_var)

+        self._logging_debug(f"benchmark_results: {benchmark_results}, model_env_vars: {model_env_vars}")
+
         _pretty_print_benchmark_results(
             benchmark_results,
             model_env_vars
@@ -482,7 +495,6 @@ def tune_for_djl_jumpstart(self, max_tuning_duration: int = 1800):
 
         return self._tune_for_js(
             num_shard_env_var=num_shard_env_var,
-            # DJL does enable multiple model copies serving.
             multiple_model_copies_enabled=True,
             max_tuning_duration=max_tuning_duration
         )
@@ -491,7 +503,6 @@ def tune_for_djl_jumpstart(self, max_tuning_duration: int = 1800):
     def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
         """Tune for Jumpstart Models with TGI serving"""
         return self._tune_for_js(
-            num_shard_env_var="SM_NUM_GPUS",
             # Currently, TGI does not enable multiple model copies serving.
             multiple_model_copies_enabled=False,
             max_tuning_duration=max_tuning_duration
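
Note: per the two hunks above, the DJL tuner keeps multiple model copies enabled while the
TGI tuner does not, so only the DJL path also varies the worker count. A hypothetical call
site, where everything except the two `tune_for_*` method names is an assumption:

    # Hypothetical usage sketch: `model` stands for the deployable object a
    # ModelBuilder in local-container mode would produce for a JumpStart LLM;
    # its construction is assumed and not shown in the diff.
    tuned_model = model.tune_for_tgi_jumpstart(max_tuning_duration=900)   # varies TP degree only
    # tuned_model = model.tune_for_djl_jumpstart(max_tuning_duration=900) # varies TP degree and copies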