Skip to content

Commit 0472607

Browse files
Merge branch 'master' into master
2 parents c260aa1 + f35b48c commit 0472607

File tree

20 files changed

+357
-10
lines changed

20 files changed

+357
-10
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
# Changelog
22

3+
## v2.182.0 (2023-08-29)
4+
5+
### Features
6+
7+
* image url for modelmonitor in TLV region
8+
* Enable spot training on remote decorator and executor
9+
10+
## v2.181.0 (2023-08-28)
11+
12+
### Features
13+
14+
* StabilityAI DLC Image URIs
15+
16+
### Bug Fixes and Other Changes
17+
18+
* temporarily skip kmeans notebook
19+
320
## v2.180.0 (2023-08-24)
421

522
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.180.1.dev0
1+
2.182.1.dev0

src/sagemaker/image_uri_config/model-monitor.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
"us-east-1": "156813124566",
2727
"us-east-2": "777275614652",
2828
"us-west-1": "890145073186",
29-
"us-west-2": "159807026194"
29+
"us-west-2": "159807026194",
30+
"il-central-1": "843974653677"
3031
},
3132
"repository": "sagemaker-model-monitor-analyzer"
3233
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
{
2+
"inference": {
3+
"processors": ["gpu"],
4+
"version_aliases": {
5+
"0.1": "0.1.0"
6+
},
7+
"versions": {
8+
"0.1.0": {
9+
"py_versions": ["py310"],
10+
"registries": {
11+
"af-south-1": "626614931356",
12+
"il-central-1": "780543022126",
13+
"ap-east-1": "871362719292",
14+
"ap-northeast-1": "763104351884",
15+
"ap-northeast-2": "763104351884",
16+
"ap-northeast-3": "364406365360",
17+
"ap-south-1": "763104351884",
18+
"ap-south-2": "772153158452",
19+
"ap-southeast-1": "763104351884",
20+
"ap-southeast-2": "763104351884",
21+
"ap-southeast-3": "907027046896",
22+
"ap-southeast-4": "457447274322",
23+
"ca-central-1": "763104351884",
24+
"eu-central-1": "763104351884",
25+
"eu-central-2": "380420809688",
26+
"eu-north-1": "763104351884",
27+
"eu-west-1": "763104351884",
28+
"eu-west-2": "763104351884",
29+
"eu-west-3": "763104351884",
30+
"eu-south-1": "692866216735",
31+
"eu-south-2": "503227376785",
32+
"me-south-1": "217643126080",
33+
"me-central-1": "914824155844",
34+
"sa-east-1": "763104351884",
35+
"us-east-1": "763104351884",
36+
"us-east-2": "763104351884",
37+
"us-west-1": "763104351884",
38+
"us-west-2": "763104351884"
39+
},
40+
"tag_prefix": "2.0.1-sgm0.1.0",
41+
"repository": "stabilityai-pytorch-inference",
42+
"container_version": {
43+
"gpu": "cu118-ubuntu20.04-sagemaker"
44+
}
45+
}
46+
}
47+
}
48+
}

src/sagemaker/image_uris.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
TRAINIUM_ALLOWED_FRAMEWORKS = "pytorch"
4040
INFERENCE_GRAVITON = "inference_graviton"
4141
DATA_WRANGLER_FRAMEWORK = "data-wrangler"
42+
STABILITYAI_FRAMEWORK = "stabilityai"
4243

4344

4445
@override_pipeline_parameter_var
@@ -476,7 +477,11 @@ def _validate_version_and_set_if_needed(version, config, framework):
476477

477478
return available_versions[0]
478479

479-
if version is None and framework in [DATA_WRANGLER_FRAMEWORK, HUGGING_FACE_LLM_FRAMEWORK]:
480+
if version is None and framework in [
481+
DATA_WRANGLER_FRAMEWORK,
482+
HUGGING_FACE_LLM_FRAMEWORK,
483+
STABILITYAI_FRAMEWORK,
484+
]:
480485
version = _get_latest_versions(available_versions)
481486

482487
_validate_arg(version, available_versions + aliased_versions, "{} version".format(framework))
@@ -614,6 +619,7 @@ def _format_tag(tag_prefix, processor, py_version, container_version, inference_
614619
return "-".join(x for x in (tag_prefix, processor, py_version, container_version) if x)
615620

616621

622+
@override_pipeline_parameter_var
617623
def get_training_image_uri(
618624
region,
619625
framework,

src/sagemaker/remote_function/client.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ def remote(
8383
volume_size: int = 30,
8484
encrypt_inter_container_traffic: bool = None,
8585
spark_config: SparkConfig = None,
86+
use_spot_instances=False,
87+
max_wait_time_in_seconds=None,
8688
):
8789
"""Decorator for running the annotated function as a SageMaker training job.
8890
@@ -255,6 +257,14 @@ def remote(
255257
Spark image. If ``spark_config`` is specified, a SageMaker Spark image uri
256258
will be used for training. Note that ``image_uri`` can not be specified at the
257259
same time otherwise a ``ValueError`` is thrown. Defaults to ``None``.
260+
261+
use_spot_instances (bool): Specifies whether to use SageMaker Managed Spot instances for
262+
training. If enabled then the ``max_wait_time_in_seconds`` arg should also be set.
263+
Defaults to ``False``.
264+
265+
max_wait_time_in_seconds (int): Timeout in seconds waiting for spot training job.
266+
After this time, Amazon SageMaker stops waiting for the managed spot training
267+
job to complete. Defaults to ``None``.
258268
"""
259269

260270
def _remote(func):
@@ -284,6 +294,8 @@ def _remote(func):
284294
volume_size=volume_size,
285295
encrypt_inter_container_traffic=encrypt_inter_container_traffic,
286296
spark_config=spark_config,
297+
use_spot_instances=use_spot_instances,
298+
max_wait_time_in_seconds=max_wait_time_in_seconds,
287299
)
288300

289301
@functools.wraps(func)
@@ -492,6 +504,8 @@ def __init__(
492504
volume_size: int = 30,
493505
encrypt_inter_container_traffic: bool = None,
494506
spark_config: SparkConfig = None,
507+
use_spot_instances=False,
508+
max_wait_time_in_seconds=None,
495509
):
496510
"""Constructor for RemoteExecutor
497511
@@ -670,6 +684,14 @@ def __init__(
670684
Spark image. If ``spark_config`` is specified, a SageMaker Spark image uri
671685
will be used for training. Note that ``image_uri`` can not be specified at the
672686
same time otherwise a ``ValueError`` is thrown. Defaults to ``None``.
687+
688+
use_spot_instances (bool): Specifies whether to use SageMaker Managed Spot instances for
689+
training. If enabled then the ``max_wait_time_in_seconds`` arg should also be set.
690+
Defaults to ``False``.
691+
692+
max_wait_time_in_seconds (int): Timeout in seconds waiting for spot training job.
693+
After this time, Amazon SageMaker stops waiting for the managed spot training
694+
job to complete. Defaults to ``None``.
673695
"""
674696
self.max_parallel_jobs = max_parallel_jobs
675697

@@ -707,6 +729,8 @@ def __init__(
707729
volume_size=volume_size,
708730
encrypt_inter_container_traffic=encrypt_inter_container_traffic,
709731
spark_config=spark_config,
732+
use_spot_instances=use_spot_instances,
733+
max_wait_time_in_seconds=max_wait_time_in_seconds,
710734
)
711735

712736
self._state_condition = threading.Condition()

src/sagemaker/remote_function/job.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ def __init__(
191191
volume_size: int = 30,
192192
encrypt_inter_container_traffic: bool = None,
193193
spark_config: SparkConfig = None,
194+
use_spot_instances=False,
195+
max_wait_time_in_seconds=None,
194196
):
195197
"""Initialize a _JobSettings instance which configures the remote job.
196198
@@ -353,6 +355,14 @@ def __init__(
353355
Spark image. If ``spark_config`` is specified, a SageMaker Spark image uri
354356
will be used for training. Note that ``image_uri`` can not be specified at the
355357
same time otherwise a ``ValueError`` is thrown. Defaults to ``None``.
358+
359+
use_spot_instances (bool): Specifies whether to use SageMaker Managed Spot instances for
360+
training. If enabled then the ``max_wait_time_in_seconds`` arg should also be set.
361+
Defaults to ``False``.
362+
363+
max_wait_time_in_seconds (int): Timeout in seconds waiting for spot training job.
364+
After this time, Amazon SageMaker stops waiting for the managed spot
365+
training job to complete. Defaults to ``None``.
356366
"""
357367
self.sagemaker_session = sagemaker_session or Session()
358368
self.environment_variables = resolve_value_from_config(
@@ -439,6 +449,8 @@ def __init__(
439449
self.max_retry_attempts = max_retry_attempts
440450
self.keep_alive_period_in_seconds = keep_alive_period_in_seconds
441451
self.spark_config = spark_config
452+
self.use_spot_instances = use_spot_instances
453+
self.max_wait_time_in_seconds = max_wait_time_in_seconds
442454
self.job_conda_env = resolve_value_from_config(
443455
direct_input=job_conda_env,
444456
config_path=REMOTE_FUNCTION_JOB_CONDA_ENV,
@@ -648,12 +660,16 @@ def start(job_settings: _JobSettings, func, func_args, func_kwargs, run_info=Non
648660

649661
stored_function.save(func, *func_args, **func_kwargs)
650662

663+
stopping_condition = {
664+
"MaxRuntimeInSeconds": job_settings.max_runtime_in_seconds,
665+
}
666+
if job_settings.max_wait_time_in_seconds is not None:
667+
stopping_condition["MaxWaitTimeInSeconds"] = job_settings.max_wait_time_in_seconds
668+
651669
request_dict = dict(
652670
TrainingJobName=job_name,
653671
RoleArn=job_settings.role,
654-
StoppingCondition={
655-
"MaxRuntimeInSeconds": job_settings.max_runtime_in_seconds,
656-
},
672+
StoppingCondition=stopping_condition,
657673
RetryStrategy={"MaximumRetryAttempts": job_settings.max_retry_attempts},
658674
)
659675

@@ -742,6 +758,8 @@ def start(job_settings: _JobSettings, func, func_args, func_kwargs, run_info=Non
742758
if job_settings.vpc_config:
743759
request_dict["VpcConfig"] = job_settings.vpc_config
744760

761+
request_dict["EnableManagedSpotTraining"] = job_settings.use_spot_instances
762+
745763
request_dict["Environment"] = job_settings.environment_variables
746764

747765
extended_request = _extend_spark_config_to_request(request_dict, job_settings, s3_base_uri)

src/sagemaker/stabilityai/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
"""StabilityAI module."""
14+
from __future__ import absolute_import
15+
16+
from sagemaker.stabilityai.stability_utils import get_stabilityai_image_uri # noqa: F401
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
"""Utility functions."""
14+
15+
from __future__ import absolute_import
16+
17+
from typing import Optional
18+
19+
from sagemaker import image_uris
20+
from sagemaker.session import Session
21+
22+
23+
def get_stabilityai_image_uri(
24+
session: Optional[Session] = None,
25+
region: Optional[str] = None,
26+
version: Optional[str] = None,
27+
image_scope: Optional[str] = "inference",
28+
) -> str:
29+
"""Very basic utility function to fetch image URI of StabilityAI images.
30+
31+
Args:
32+
session (Session): SageMaker session.
33+
region (str): AWS region of image URI.
34+
version (str): Framework version. Latest version used if not specified.
35+
image_scope (str): Image type. e.g. inference, training
36+
Returns:
37+
Image URI string.
38+
"""
39+
40+
if region is None:
41+
if session is None:
42+
region = Session().boto_session.region_name
43+
else:
44+
region = session.boto_session.region_name
45+
return image_uris.retrieve(
46+
framework="stabilityai",
47+
region=region,
48+
version=version,
49+
image_scope=image_scope,
50+
)

src/sagemaker/workflow/utilities.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,10 @@ def override_pipeline_parameter_var(func):
390390
We should remove this decorator after the grace period.
391391
"""
392392
warning_msg_template = (
393-
"The input argument %s of function (%s) is a pipeline variable (%s), which is not allowed. "
394-
"The default_value of this Parameter object will be used to override it. "
393+
"The input argument %s of function (%s) is a pipeline variable (%s), "
394+
"which is interpreted in pipeline execution time only. "
395+
"As the function needs to evaluate the argument value in SDK compile time, "
396+
"the default_value of this Parameter object will be used to override it. "
395397
"Please make sure the default_value is valid."
396398
)
397399

tests/integ/sagemaker/remote_function/test_decorator.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,24 @@ def get_file_content(file_names):
608608
assert "line 2: bws: command not found" in str(e)
609609

610610

611+
def test_decorator_with_spot_instances(
612+
sagemaker_session, dummy_container_without_error, cpu_instance_type
613+
):
614+
@remote(
615+
role=ROLE,
616+
image_uri=dummy_container_without_error,
617+
instance_type=cpu_instance_type,
618+
sagemaker_session=sagemaker_session,
619+
use_spot_instances=True,
620+
max_wait_time_in_seconds=48 * 60 * 60,
621+
)
622+
def divide(x, y):
623+
return x / y
624+
625+
assert divide(10, 2) == 5
626+
assert divide(20, 2) == 10
627+
628+
611629
@pytest.mark.skip
612630
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
613631
@remote(

tests/integ/sagemaker/remote_function/test_executor.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,46 @@ def cube(x):
195195
assert metric_summary.avg == 550
196196

197197

198+
def test_executor_submit_using_spot_instances(
199+
sagemaker_session, dummy_container_without_error, cpu_instance_type
200+
):
201+
def square_on_spot_instance(x):
202+
return x * x
203+
204+
def cube_on_spot_instance(x):
205+
return x * x * x
206+
207+
with RemoteExecutor(
208+
max_parallel_jobs=1,
209+
role=ROLE,
210+
image_uri=dummy_container_without_error,
211+
instance_type=cpu_instance_type,
212+
sagemaker_session=sagemaker_session,
213+
use_spot_instances=True,
214+
max_wait_time_in_seconds=48 * 60 * 60,
215+
) as e:
216+
future_1 = e.submit(square_on_spot_instance, 10)
217+
future_2 = e.submit(cube_on_spot_instance, 10)
218+
219+
assert future_1.result() == 100
220+
assert future_2.result() == 1000
221+
222+
assert get_future(future_1._job.job_name, sagemaker_session).result() == 100
223+
assert get_future(future_2._job.job_name, sagemaker_session).result() == 1000
224+
225+
describe_job_1 = next(
226+
list_futures(job_name_prefix="square-on-spot-instance", sagemaker_session=sagemaker_session)
227+
)._job.describe()
228+
assert describe_job_1["EnableManagedSpotTraining"] is True
229+
assert describe_job_1["StoppingCondition"]["MaxWaitTimeInSeconds"] == 172800
230+
231+
describe_job_2 = next(
232+
list_futures(job_name_prefix="cube-on-spot-instance", sagemaker_session=sagemaker_session)
233+
)._job.describe()
234+
assert describe_job_2["EnableManagedSpotTraining"] is True
235+
assert describe_job_2["StoppingCondition"]["MaxWaitTimeInSeconds"] == 172800
236+
237+
198238
def test_executor_map_with_run(sagemaker_session, dummy_container_without_error, cpu_instance_type):
199239
def square(x):
200240
with load_run() as run:

0 commit comments

Comments
 (0)