
Commit e08e1c0

Author: Jonathan Makunga (committed)
Commit message: Refactoring and Debugging
1 parent b96e78f · commit e08e1c0

File tree: 2 files changed (+48, -37 lines)

src/sagemaker/serve/builder/jumpstart_builder.py

Lines changed: 45 additions & 34 deletions
@@ -295,49 +295,42 @@ def _tune_for_js(
             f"initial_env_vars: {initial_env_vars},"
             f" admissible_tensor_parallel_degrees: {admissible_tensor_parallel_degrees}")
 
-        available_gpus = None
-        if multiple_model_copies_enabled:
-            available_gpus = _get_available_gpus()
+        available_gpus = _get_available_gpus() if multiple_model_copies_enabled else None
         self._logging_debug(
             f"multiple_model_copies_enabled: {multiple_model_copies_enabled}, available_gpus: {available_gpus}")
 
         benchmark_results = {}
         best_tuned_combination = None
         timeout = datetime.now() + timedelta(seconds=max_tuning_duration)
+
         for tensor_parallel_degree in admissible_tensor_parallel_degrees:
             if datetime.now() > timeout:
                 logger.info("Max tuning duration reached. Tuning stopped.")
                 break
-            try:
-                self.pysdk_model.env.update({
-                    num_shard_env_var: str(tensor_parallel_degree)
-                })
-                self._logging_debug(
-                    f"num_shard_env_var: {num_shard_env_var}, tensor_parallel_degree: {tensor_parallel_degree}")
-            except Exception as e:
-                self._logging_debug(str(e))
-
-            logging_msg = f"{num_shard_env_var}: {tensor_parallel_degree}."
 
-            sagemaker_model_server_workers = None
+            sagemaker_model_server_workers = 1
             if multiple_model_copies_enabled:
                 sagemaker_model_server_workers = int(available_gpus / tensor_parallel_degree)
-                self._logging_debug(f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
-                try:
-                    self.pysdk_model.env.update({
-                        num_model_copies_env_var: str(sagemaker_model_server_workers)
-                    })
-                    self._logging_debug(
-                        f"num_model_copies_env_var: {num_model_copies_env_var}, "
-                        f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
-                except Exception as e:
-                    self._logging_debug(str(e))
-                logging_msg = f"{logging_msg} {num_model_copies_env_var}: {sagemaker_model_server_workers}."
+
+            self.pysdk_model.env.update({
+                num_shard_env_var: str(tensor_parallel_degree),
+                num_model_copies_env_var: str(sagemaker_model_server_workers)
+            })
+
+            self._logging_debug(
+                f"num_shard_env_var: {num_shard_env_var}, tensor_parallel_degree: {tensor_parallel_degree}")
+            self._logging_debug(f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
+            self._logging_debug(
+                f"num_model_copies_env_var: {num_model_copies_env_var}, "
+                f"sagemaker_model_server_workers: {sagemaker_model_server_workers}")
+            self._logging_debug(f"current model.env: {self.pysdk_model.env}")
 
             try:
+                self._logging_debug(f"Deploying model with env config {self.pysdk_model.env}")
                 predictor = self.pysdk_model.deploy(
                     model_data_download_timeout=max_tuning_duration
                 )
+                self._logging_debug(f"Deployed model with env config {self.pysdk_model.env}")
 
                 avg_latency, p90, avg_tokens_per_second = _serial_benchmark(
                     predictor, self.schema_builder.sample_input
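The hunk above replaces the per-variable try/except updates with a single env.update call and derives the model-copy count from the available GPU count. A minimal standalone sketch of that calculation, assuming a hypothetical 8-GPU host, a plain dict in place of self.pysdk_model.env, and an assumed name for the copy-count variable:

# Sketch only: hypothetical values stand in for _get_available_gpus() and the
# admissible tensor parallel degrees; env is a plain dict, not self.pysdk_model.env.
available_gpus = 8
num_shard_env_var = "SM_NUM_GPUS"                             # name taken from the TGI path below
num_model_copies_env_var = "SAGEMAKER_MODEL_SERVER_WORKERS"   # assumed name, for illustration only
env = {}

for tensor_parallel_degree in (1, 2, 4, 8):
    # One model copy per group of GPUs claimed by a single copy.
    sagemaker_model_server_workers = int(available_gpus / tensor_parallel_degree)
    env.update({
        num_shard_env_var: str(tensor_parallel_degree),
        num_model_copies_env_var: str(sagemaker_model_server_workers),
    })
    print(env)  # first pass: {'SM_NUM_GPUS': '1', 'SAGEMAKER_MODEL_SERVER_WORKERS': '8'}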
@@ -385,53 +378,71 @@ def _tune_for_js(
                     best_tuned_combination = tuned_configuration
             except LocalDeepPingException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {num_shard_env_var}: %s. "
+                    "Deployment unsuccessful with %s: %s. %s: %s"
                     "Failed to invoke the model server: %s",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelOutOfMemoryException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    f"Deployment unsuccessful with %s: %s. %s: %s. "
                     "Out of memory when loading the model: %s",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelInvocationException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    f"Deployment unsuccessful with %s: %s. %s: %s. "
                     "Failed to invoke the model server: %s"
                     "Please check that model server configurations are as expected "
                     "(Ex. serialization, deserialization, content_type, accept).",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except LocalModelLoadException as e:
                 logger.warning(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    f"Deployment unsuccessful with %s: %s. %s: %s. "
                     "Failed to load the model: %s.",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except SkipTuningComboException as e:
                 logger.warning(
-                    f"Deployment with {logging_msg} "
+                    f"Deployment with %s: %s. %s: %s. "
                     "was expected to be successful. However failed with: %s. "
                     "Trying next combination.",
+                    num_shard_env_var,
                     tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                     str(e),
                 )
                 break
             except Exception as e:
                 logger.exception(e)
                 logger.exception(
-                    f"Deployment unsuccessful with {logging_msg} "
+                    f"Deployment unsuccessful with %s: %s. %s: %s. "
                     "with uncovered exception",
-                    tensor_parallel_degree
+                    num_shard_env_var,
+                    tensor_parallel_degree,
+                    num_model_copies_env_var,
+                    sagemaker_model_server_workers,
                 )
                 break
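The reworked warning calls in this hunk move from f-strings to printf-style placeholders, which the logging module only formats when the record is actually emitted, and they now report the model-copy count alongside the tensor parallel degree. (The f prefix that several of the new format strings keep is redundant, since nothing is interpolated into them.) A small self-contained illustration of this logging style; the logger name and the values below are made up for the example:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("tuning-example")  # hypothetical logger name

num_shard_env_var = "SM_NUM_GPUS"
num_model_copies_env_var = "SAGEMAKER_MODEL_SERVER_WORKERS"  # assumed name, as above
tensor_parallel_degree, sagemaker_model_server_workers = 4, 2

# Adjacent string literals are concatenated, and the %s arguments are substituted
# lazily by the logging module, only when WARNING records are actually handled.
logger.warning(
    "Deployment unsuccessful with %s: %s. %s: %s. "
    "Out of memory when loading the model: %s",
    num_shard_env_var,
    tensor_parallel_degree,
    num_model_copies_env_var,
    sagemaker_model_server_workers,
    "CUDA out of memory",  # stand-in for str(e)
)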

@@ -447,6 +458,8 @@ def _tune_for_js(
                 })
                 model_env_vars.append(num_model_copies_env_var)
 
+        self._logging_debug(f"benchmark_results: {benchmark_results}, model_env_vars: {model_env_vars}")
+
         _pretty_print_benchmark_results(
             benchmark_results,
             model_env_vars
@@ -482,7 +495,6 @@ def tune_for_djl_jumpstart(self, max_tuning_duration: int = 1800):
 
         return self._tune_for_js(
             num_shard_env_var=num_shard_env_var,
-            # DJL does enable multiple model copies serving.
             multiple_model_copies_enabled=True,
             max_tuning_duration=max_tuning_duration
         )
@@ -491,7 +503,6 @@ def tune_for_djl_jumpstart(self, max_tuning_duration: int = 1800):
     def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
         """Tune for Jumpstart Models with TGI serving"""
         return self._tune_for_js(
-            num_shard_env_var="SM_NUM_GPUS",
             # Currently, TGI does not enable multiple model copies serving.
             multiple_model_copies_enabled=False,
             max_tuning_duration=max_tuning_duration
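Taken together, the two entry points differ only in what they forward to _tune_for_js: DJL serving tunes both tensor parallelism and model copies, while TGI serving tunes tensor parallelism alone. A reduced sketch of that delegation pattern, with the shared tuner stubbed out (the stub and its return value are illustrative assumptions, not the SDK implementation):

class TunerSketch:
    """Illustrative stand-in for the JumpStart builder's tuning entry points."""

    def _tune_for_js(self, multiple_model_copies_enabled=False, max_tuning_duration=1800):
        # Stub: the real method benchmarks every admissible tensor-parallel degree.
        return {
            "multiple_model_copies_enabled": multiple_model_copies_enabled,
            "max_tuning_duration": max_tuning_duration,
        }

    def tune_for_djl_jumpstart(self, max_tuning_duration=1800):
        # DJL serving can host multiple model copies, so copy-count tuning is enabled.
        return self._tune_for_js(multiple_model_copies_enabled=True,
                                 max_tuning_duration=max_tuning_duration)

    def tune_for_tgi_jumpstart(self, max_tuning_duration=1800):
        # TGI serving currently does not, so only tensor parallelism is tuned.
        return self._tune_for_js(multiple_model_copies_enabled=False,
                                 max_tuning_duration=max_tuning_duration)


print(TunerSketch().tune_for_djl_jumpstart())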

tests/integ/sagemaker/serve/test_serve_tune_js.py

Lines changed: 3 additions & 3 deletions
@@ -33,13 +33,13 @@
 SAMPLE_RESPONSE = [
     {"generated_text": "Hello, I'm a language model, and I'm here to help you with your English."}
 ]
-TGI_JS_MODEL_ID = "huggingface-llm-amazon-falconlite"
+JS_MODEL_ID = "huggingface-textgeneration-gpt2"
 
 
 @pytest.fixture
 def model_builder(sagemaker_local_session):
     return ModelBuilder(
-        model=TGI_JS_MODEL_ID,
+        model=JS_MODEL_ID,
         schema_builder=SchemaBuilder(SAMPLE_PROMPT, SAMPLE_RESPONSE),
         mode=Mode.LOCAL_CONTAINER,
         sagemaker_session=sagemaker_local_session,
@@ -50,7 +50,7 @@ def model_builder(sagemaker_local_session):
 #     PYTHON_VERSION_IS_NOT_310,
 #     reason="The goal of these tests are to test the serving components of our feature",
 # )
-@pytest.mark.local_mode
+# @pytest.mark.local_mode
 def test_happy_tgi_sagemaker_endpoint(model_builder, gpu_instance_type):
     logger.info("Running in LOCAL_CONTAINER mode...")
     caught_ex = None
