Skip to content

Commit 3f3f947

Browse files
authored
Merge branch 'master' into tgi133
2 parents 364b0ff + aa5fd00 commit 3f3f947

File tree

15 files changed

+410
-26
lines changed

15 files changed

+410
-26
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Changelog
22

3+
## v2.202.0 (2023-12-21)
4+
5+
### Features
6+
7+
* support remote debug for sagemaker training job
8+
9+
### Bug Fixes and Other Changes
10+
11+
* update image_uri_configs 12-21-2023 08:32:41 PST
12+
* Update tblib constraint
13+
314
## v2.201.0 (2023-12-20)
415

516
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.201.1.dev0
1+
2.202.1.dev0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def read_requirements(filename):
6363
"PyYAML~=6.0",
6464
"jsonschema",
6565
"platformdirs",
66-
"tblib==1.7.0",
66+
"tblib>=1.7.0,<3",
6767
"urllib3<1.27",
6868
"uvicorn==0.22.0",
6969
"fastapi==0.95.2",

src/sagemaker/automl/automl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ def attach(cls, auto_ml_job_name, sagemaker_session=None):
332332
total_job_runtime_in_seconds=auto_ml_job_desc.get("AutoMLJobConfig", {})
333333
.get("CompletionCriteria", {})
334334
.get("MaxAutoMLJobRuntimeInSeconds"),
335-
job_objective=auto_ml_job_desc.get("AutoMLJobObjective", {}).get("MetricName"),
335+
job_objective=auto_ml_job_desc.get("AutoMLJobObjective", {}),
336336
generate_candidate_definitions_only=auto_ml_job_desc.get(
337337
"GenerateCandidateDefinitionsOnly", False
338338
),

src/sagemaker/estimator.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ def __init__(
178178
container_entry_point: Optional[List[str]] = None,
179179
container_arguments: Optional[List[str]] = None,
180180
disable_output_compression: bool = False,
181+
enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None,
181182
**kwargs,
182183
):
183184
"""Initialize an ``EstimatorBase`` instance.
@@ -540,6 +541,8 @@ def __init__(
540541
to Amazon S3 without compression after training finishes.
541542
enable_infra_check (bool or PipelineVariable): Optional.
542543
Specifies whether to run SageMaker built-in infra check jobs.
544+
enable_remote_debug (bool or PipelineVariable): Optional.
545+
Specifies whether RemoteDebug is enabled for the training job.
543546
"""
544547
instance_count = renamed_kwargs(
545548
"train_instance_count", "instance_count", instance_count, kwargs
@@ -777,6 +780,8 @@ def __init__(
777780

778781
self.tensorboard_app = TensorBoardApp(region=self.sagemaker_session.boto_region_name)
779782

783+
self._enable_remote_debug = enable_remote_debug
784+
780785
@abstractmethod
781786
def training_image_uri(self):
782787
"""Return the Docker image to use for training.
@@ -1958,6 +1963,11 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
19581963
max_wait = job_details.get("StoppingCondition", {}).get("MaxWaitTimeInSeconds")
19591964
if max_wait:
19601965
init_params["max_wait"] = max_wait
1966+
1967+
if "RemoteDebugConfig" in job_details:
1968+
init_params["enable_remote_debug"] = job_details["RemoteDebugConfig"].get(
1969+
"EnableRemoteDebug"
1970+
)
19611971
return init_params
19621972

19631973
def _get_instance_type(self):
@@ -2292,6 +2302,32 @@ def update_profiler(
22922302

22932303
_TrainingJob.update(self, profiler_rule_configs, profiler_config_request_dict)
22942304

2305+
def get_remote_debug_config(self):
2306+
"""dict: Return the configuration of RemoteDebug"""
2307+
return (
2308+
None
2309+
if self._enable_remote_debug is None
2310+
else {"EnableRemoteDebug": self._enable_remote_debug}
2311+
)
2312+
2313+
def enable_remote_debug(self):
2314+
"""Enable remote debug for a training job."""
2315+
self._update_remote_debug(True)
2316+
2317+
def disable_remote_debug(self):
2318+
"""Disable remote debug for a training job."""
2319+
self._update_remote_debug(False)
2320+
2321+
def _update_remote_debug(self, enable_remote_debug: bool):
2322+
"""Update to enable or disable remote debug for a training job.
2323+
2324+
This method updates the ``_enable_remote_debug`` parameter
2325+
and enables or disables remote debug for a training job
2326+
"""
2327+
self._ensure_latest_training_job()
2328+
_TrainingJob.update(self, remote_debug_config={"EnableRemoteDebug": enable_remote_debug})
2329+
self._enable_remote_debug = enable_remote_debug
2330+
22952331
def get_app_url(
22962332
self,
22972333
app_type,
@@ -2520,6 +2556,9 @@ def _get_train_args(cls, estimator, inputs, experiment_config):
25202556
if estimator.profiler_config:
25212557
train_args["profiler_config"] = estimator.profiler_config._to_request_dict()
25222558

2559+
if estimator.get_remote_debug_config() is not None:
2560+
train_args["remote_debug_config"] = estimator.get_remote_debug_config()
2561+
25232562
return train_args
25242563

25252564
@classmethod
@@ -2549,7 +2588,12 @@ def _is_local_channel(cls, input_uri):
25492588

25502589
@classmethod
25512590
def update(
2552-
cls, estimator, profiler_rule_configs=None, profiler_config=None, resource_config=None
2591+
cls,
2592+
estimator,
2593+
profiler_rule_configs=None,
2594+
profiler_config=None,
2595+
resource_config=None,
2596+
remote_debug_config=None,
25532597
):
25542598
"""Update a running Amazon SageMaker training job.
25552599
@@ -2562,20 +2606,31 @@ def update(
25622606
resource_config (dict): Configuration of the resources for the training job. You can
25632607
update the keep-alive period if the warm pool status is `Available`. No other fields
25642608
can be updated. (default: None).
2609+
remote_debug_config (dict): Configuration for RemoteDebug. (default: ``None``)
2610+
The dict can contain 'EnableRemoteDebug'(bool).
2611+
For example,
2612+
2613+
.. code:: python
2614+
2615+
remote_debug_config = {
2616+
"EnableRemoteDebug": True,
2617+
}
25652618
25662619
Returns:
25672620
sagemaker.estimator._TrainingJob: Constructed object that captures
25682621
all information about the updated training job.
25692622
"""
25702623
update_args = cls._get_update_args(
2571-
estimator, profiler_rule_configs, profiler_config, resource_config
2624+
estimator, profiler_rule_configs, profiler_config, resource_config, remote_debug_config
25722625
)
25732626
estimator.sagemaker_session.update_training_job(**update_args)
25742627

25752628
return estimator.latest_training_job
25762629

25772630
@classmethod
2578-
def _get_update_args(cls, estimator, profiler_rule_configs, profiler_config, resource_config):
2631+
def _get_update_args(
2632+
cls, estimator, profiler_rule_configs, profiler_config, resource_config, remote_debug_config
2633+
):
25792634
"""Constructs a dict of arguments for updating an Amazon SageMaker training job.
25802635
25812636
Args:
@@ -2596,6 +2651,7 @@ def _get_update_args(cls, estimator, profiler_rule_configs, profiler_config, res
25962651
update_args.update(build_dict("profiler_rule_configs", profiler_rule_configs))
25972652
update_args.update(build_dict("profiler_config", profiler_config))
25982653
update_args.update(build_dict("resource_config", resource_config))
2654+
update_args.update(build_dict("remote_debug_config", remote_debug_config))
25992655

26002656
return update_args
26012657

@@ -2694,6 +2750,7 @@ def __init__(
26942750
container_arguments: Optional[List[str]] = None,
26952751
disable_output_compression: bool = False,
26962752
enable_infra_check: Optional[Union[bool, PipelineVariable]] = None,
2753+
enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None,
26972754
**kwargs,
26982755
):
26992756
"""Initialize an ``Estimator`` instance.
@@ -3055,6 +3112,8 @@ def __init__(
30553112
to Amazon S3 without compression after training finishes.
30563113
enable_infra_check (bool or PipelineVariable): Optional.
30573114
Specifies whether to run SageMaker built-in infra check jobs.
3115+
enable_remote_debug (bool or PipelineVariable): Optional.
3116+
Specifies whether RemoteDebug is enabled for the training job.
30583117
"""
30593118
self.image_uri = image_uri
30603119
self._hyperparameters = hyperparameters.copy() if hyperparameters else {}
@@ -3106,6 +3165,7 @@ def __init__(
31063165
container_entry_point=container_entry_point,
31073166
container_arguments=container_arguments,
31083167
disable_output_compression=disable_output_compression,
3168+
enable_remote_debug=enable_remote_debug,
31093169
**kwargs,
31103170
)
31113171

src/sagemaker/image_uri_config/spark.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"ap-southeast-3": "800295151634",
2222
"ap-southeast-4": "819679513684",
2323
"ca-central-1": "446299261295",
24+
"ca-west-1": "000907499111",
2425
"cn-north-1": "671472414489",
2526
"cn-northwest-1": "844356804704",
2627
"eu-central-1": "906073651304",
@@ -61,6 +62,7 @@
6162
"ap-southeast-3": "800295151634",
6263
"ap-southeast-4": "819679513684",
6364
"ca-central-1": "446299261295",
65+
"ca-west-1": "000907499111",
6466
"cn-north-1": "671472414489",
6567
"cn-northwest-1": "844356804704",
6668
"eu-central-1": "906073651304",
@@ -101,6 +103,7 @@
101103
"ap-southeast-3": "800295151634",
102104
"ap-southeast-4": "819679513684",
103105
"ca-central-1": "446299261295",
106+
"ca-west-1": "000907499111",
104107
"cn-north-1": "671472414489",
105108
"cn-northwest-1": "844356804704",
106109
"eu-central-1": "906073651304",
@@ -141,6 +144,7 @@
141144
"ap-southeast-3": "800295151634",
142145
"ap-southeast-4": "819679513684",
143146
"ca-central-1": "446299261295",
147+
"ca-west-1": "000907499111",
144148
"cn-north-1": "671472414489",
145149
"cn-northwest-1": "844356804704",
146150
"eu-central-1": "906073651304",
@@ -181,6 +185,7 @@
181185
"ap-southeast-3": "800295151634",
182186
"ap-southeast-4": "819679513684",
183187
"ca-central-1": "446299261295",
188+
"ca-west-1": "000907499111",
184189
"cn-north-1": "671472414489",
185190
"cn-northwest-1": "844356804704",
186191
"eu-central-1": "906073651304",

src/sagemaker/jumpstart/estimator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def __init__(
106106
container_entry_point: Optional[List[str]] = None,
107107
container_arguments: Optional[List[str]] = None,
108108
disable_output_compression: Optional[bool] = None,
109+
enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None,
109110
):
110111
"""Initializes a ``JumpStartEstimator``.
111112
@@ -495,6 +496,8 @@ def __init__(
495496
a training job.
496497
disable_output_compression (Optional[bool]): When set to true, Model is uploaded
497498
to Amazon S3 without compression after training finishes.
499+
enable_remote_debug (bool or PipelineVariable): Optional.
500+
Specifies whether RemoteDebug is enabled for the training job.
498501
499502
Raises:
500503
ValueError: If the model ID is not recognized by JumpStart.
@@ -569,6 +572,7 @@ def _is_valid_model_id_hook():
569572
container_arguments=container_arguments,
570573
disable_output_compression=disable_output_compression,
571574
enable_infra_check=enable_infra_check,
575+
enable_remote_debug=enable_remote_debug,
572576
)
573577

574578
self.model_id = estimator_init_kwargs.model_id

src/sagemaker/jumpstart/factory/estimator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def get_init_kwargs(
127127
container_arguments: Optional[List[str]] = None,
128128
disable_output_compression: Optional[bool] = None,
129129
enable_infra_check: Optional[Union[bool, PipelineVariable]] = None,
130+
enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None,
130131
) -> JumpStartEstimatorInitKwargs:
131132
"""Returns kwargs required to instantiate `sagemaker.estimator.Estimator` object."""
132133

@@ -183,6 +184,7 @@ def get_init_kwargs(
183184
container_arguments=container_arguments,
184185
disable_output_compression=disable_output_compression,
185186
enable_infra_check=enable_infra_check,
187+
enable_remote_debug=enable_remote_debug,
186188
)
187189

188190
estimator_init_kwargs = _add_model_version_to_kwargs(estimator_init_kwargs)

src/sagemaker/jumpstart/types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,7 @@ class JumpStartEstimatorInitKwargs(JumpStartKwargs):
12801280
"container_arguments",
12811281
"disable_output_compression",
12821282
"enable_infra_check",
1283+
"enable_remote_debug",
12831284
]
12841285

12851286
SERIALIZATION_EXCLUSION_SET = {
@@ -1344,6 +1345,7 @@ def __init__(
13441345
container_arguments: Optional[List[str]] = None,
13451346
disable_output_compression: Optional[bool] = None,
13461347
enable_infra_check: Optional[Union[bool, PipelineVariable]] = None,
1348+
enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None,
13471349
) -> None:
13481350
"""Instantiates JumpStartEstimatorInitKwargs object."""
13491351

@@ -1401,6 +1403,7 @@ def __init__(
14011403
self.container_arguments = container_arguments
14021404
self.disable_output_compression = disable_output_compression
14031405
self.enable_infra_check = enable_infra_check
1406+
self.enable_remote_debug = enable_remote_debug
14041407

14051408

14061409
class JumpStartEstimatorFitKwargs(JumpStartKwargs):

src/sagemaker/serve/utils/telemetry_logger.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,35 @@ def wrapper(self, *args, **kwargs):
5959
caught_ex = None
6060

6161
image_uri_tail = self.image_uri.split("/")[1]
62-
extra = f"{func_name}&{MODEL_SERVER_TO_CODE[str(self.model_server)]}&{image_uri_tail}"
62+
extra = (
63+
f"{func_name}"
64+
f"&x-modelServer={MODEL_SERVER_TO_CODE[str(self.model_server)]}"
65+
f"&x-imageTag={image_uri_tail}"
66+
)
6367

6468
if self.model_server == ModelServer.DJL_SERVING or self.model_server == ModelServer.TGI:
65-
extra += f"&{self.model}"
69+
extra += f"&x-modelName={self.model}"
6670

6771
try:
6872
response = func(self, *args, **kwargs)
6973
if not self.serve_settings.telemetry_opt_out:
7074
_send_telemetry(
71-
"1", MODE_TO_CODE[str(self.mode)], self.sagemaker_session, None, extra
75+
"1",
76+
MODE_TO_CODE[str(self.mode)],
77+
self.sagemaker_session,
78+
None,
79+
None,
80+
extra,
7281
)
7382
except ModelBuilderException as e:
7483
if not self.serve_settings.telemetry_opt_out:
7584
_send_telemetry(
76-
"0", MODE_TO_CODE[str(self.mode)], self.sagemaker_session, str(e), extra
85+
"0",
86+
MODE_TO_CODE[str(self.mode)],
87+
self.sagemaker_session,
88+
str(e),
89+
e.__class__.__name__,
90+
extra,
7791
)
7892
caught_ex = e
7993
except Exception as e: # pylint: disable=W0703
@@ -93,13 +107,22 @@ def _send_telemetry(
93107
mode: int,
94108
session: Session,
95109
failure_reason: str = None,
110+
failure_type: str = None,
96111
extra_info: str = None,
97112
) -> None:
98113
"""Make GET request to an empty object in S3 bucket"""
99114
try:
100115
accountId = _get_accountId(session)
101116
region = _get_region_or_default(session)
102-
url = _construct_url(accountId, str(mode), status, failure_reason, extra_info, region)
117+
url = _construct_url(
118+
accountId,
119+
str(mode),
120+
status,
121+
failure_reason,
122+
failure_type,
123+
extra_info,
124+
region,
125+
)
103126
_requests_helper(url, 2)
104127
logger.debug("ModelBuilder metrics emitted.")
105128
except Exception: # pylint: disable=W0703
@@ -111,6 +134,7 @@ def _construct_url(
111134
mode: str,
112135
status: str,
113136
failure_reason: str,
137+
failure_type: str,
114138
extra_info: str,
115139
region: str,
116140
) -> str:
@@ -124,6 +148,7 @@ def _construct_url(
124148
)
125149
if failure_reason:
126150
base_url += f"&x-failureReason={failure_reason}"
151+
base_url += f"&x-failureType={failure_type}"
127152
if extra_info:
128153
base_url += f"&x-extra={extra_info}"
129154
return base_url

0 commit comments

Comments
 (0)