Commit 7989603

Merge branch 'master' into update-hf-pt-train-dlc
2 parents 972a60f + 7f823e1

12 files changed: +284 −104 lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -1,5 +1,22 @@
 # Changelog

+## v2.136.0 (2023-03-09)
+
+### Features
+
+ * with_feature_group [feature_store]
+ * Djl Large Model Support
+ * Decouple model.right_size() from model registry
+
+### Bug Fixes and Other Changes
+
+ * Fix integration test error in test_default_right_size_and_deploy_unregistered_base_model
+ * Add djl 0.21.0 dlc images
+
+### Documentation Changes
+
+ * Torchrun gpu support documentation change
+
 ## v2.135.1.post0 (2023-03-02)

 ### Documentation Changes

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.135.2.dev0
+2.136.1.dev0

doc/frameworks/pytorch/using_pytorch.rst

Lines changed: 36 additions & 6 deletions
@@ -196,6 +196,7 @@ fit Optional Arguments
 - ``logs``: Defaults to True, whether to show logs produced by training
   job in the Python session. Only meaningful when wait is True.

+----

 Distributed PyTorch Training
 ============================
@@ -262,16 +263,18 @@ during the PyTorch DDP initialization.

 .. note::

-    The SageMaker PyTorch estimator operates ``mpirun`` in the backend.
-    It doesn’t use ``torchrun`` for distributed training.
+    The SageMaker PyTorch estimator can operate both ``mpirun`` (for PyTorch 1.12.0 and later)
+    and ``torchrun`` (for PyTorch 1.13.1 and later) in the backend for distributed training.

 For more information about setting up PyTorch DDP in your training script,
 see `Getting Started with Distributed Data Parallel
 <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`_ in the
 PyTorch documentation.

-The following example shows how to run a PyTorch DDP training in SageMaker
-using two ``ml.p4d.24xlarge`` instances:
+The following examples show how to set a PyTorch estimator
+to run a distributed training job on two ``ml.p4d.24xlarge`` instances.
+
+**Using PyTorch DDP with the mpirun backend**

 .. code:: python

@@ -291,7 +294,34 @@ using two ``ml.p4d.24xlarge`` instances:
         }
     )

-    pt_estimator.fit("s3://bucket/path/to/training/data")
+**Using PyTorch DDP with the torchrun backend**
+
+.. code:: python
+
+    from sagemaker.pytorch import PyTorch
+
+    pt_estimator = PyTorch(
+        entry_point="train_ptddp.py",
+        role="SageMakerRole",
+        framework_version="1.13.1",
+        py_version="py38",
+        instance_count=2,
+        instance_type="ml.p4d.24xlarge",
+        distribution={
+            "torch_distributed": {
+                "enabled": True
+            }
+        }
+    )
+
+
+.. note::
+
+    For more information about setting up ``torchrun`` in your training script,
+    see `torchrun (Elastic Launch) <https://pytorch.org/docs/stable/elastic/run.html>`_ in *the
+    PyTorch documentation*.
+
+----

 .. _distributed-pytorch-training-on-trainium:

@@ -324,7 +354,7 @@ with the ``torch_distributed`` option as the distribution strategy.

 .. note::

-    SageMaker Debugger is currently not supported with Trn1 instances.
+    SageMaker Debugger is not compatible with Trn1 instances.

 Adapt Your Training Script to Initialize with the XLA backend
 -------------------------------------------------------------
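
Launching the estimator is unchanged in either case; as a minimal sketch, restoring the call shown in the earlier mpirun example (the S3 URI is a placeholder):

    # Start the distributed training job; the backend (mpirun or torchrun)
    # is selected by the distribution option configured on the estimator above.
    pt_estimator.fit("s3://bucket/path/to/training/data")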

src/sagemaker/clarify.py

Lines changed: 38 additions & 12 deletions
@@ -49,6 +49,7 @@
     in (
         "text/csv",
         "application/jsonlines",
+        "application/json",
         "application/sagemakercapturejson",
         "application/x-parquet",
         "application/x-image",
@@ -311,7 +312,7 @@ def __init__(
         s3_analysis_config_output_path: Optional[str] = None,
         label: Optional[str] = None,
         headers: Optional[List[str]] = None,
-        features: Optional[List[str]] = None,
+        features: Optional[str] = None,
         dataset_type: str = "text/csv",
         s3_compression_type: str = "None",
         joinsource: Optional[Union[str, int]] = None,
@@ -331,12 +332,18 @@ def __init__(
                 If this field is None, then the ``s3_output_path`` will be used
                 to store the ``analysis_config`` output.
             label (str): Target attribute of the model required by bias metrics. Specified as
-                column name or index for CSV dataset or as JMESPath expression for JSONLines.
+                column name or index for CSV dataset or a JMESPath expression for JSON/JSON Lines.
                 *Required parameter* except for when the input dataset does not contain the label.
-            features (List[str]): JMESPath expression to locate the feature columns for
-                bias metrics if the dataset format is JSONLines.
+                Note: For JSON, the JMESPath query must result in a list of labels for each
+                sample. For JSON Lines, it must result in the label for each line.
+                Only a single label per sample is supported at this time.
+            features (str): JMESPath expression to locate the feature values
+                if the dataset format is JSON/JSON Lines.
+                Note: For JSON, the JMESPath query must result in a 2-D list (or a matrix) of
+                feature values. For JSON Lines, it must result in a 1-D list of features for each
+                line.
             dataset_type (str): Format of the dataset. Valid values are ``"text/csv"`` for CSV,
-                ``"application/jsonlines"`` for JSONLines, and
+                ``"application/jsonlines"`` for JSON Lines, ``"application/json"`` for JSON, and
                 ``"application/x-parquet"`` for Parquet.
             s3_compression_type (str): Valid options are "None" or ``"Gzip"``.
             joinsource (str or int): The name or index of the column in the dataset that
@@ -359,6 +366,7 @@ def __init__(

                 Clarify will not use the ``joinsource`` column and columns present in the facet
                 dataset when calling model inference APIs.
+                Note: this is only supported for ``"text/csv"`` dataset type.
             facet_headers (list[str]): List of column names in the facet dataset.
             predicted_label_dataset_uri (str): Dataset S3 prefix/object URI with predicted labels,
                 which are used directly for analysis instead of making model inference API calls.
@@ -368,11 +376,16 @@ def __init__(
                 * If the dataset and predicted label dataset are in multiple files (either one),
                   then an index column, ``joinsource``, is required to join the two datasets.

+                Note: this is only supported for ``"text/csv"`` dataset type.
             predicted_label_headers (list[str]): List of column names in the predicted label dataset
             predicted_label (str or int): Predicted label of the target attribute of the model
-                required for running bias analysis. Specified as column name or index for CSV data.
+                required for running bias analysis. Specified as column name or index for CSV data,
+                or a JMESPath expression for JSON/JSON Lines.
                 Clarify uses the predicted labels directly instead of making model inference API
                 calls.
+                Note: For JSON, the JMESPath query must result in a list of predicted labels for
+                each sample. For JSON Lines, it must result in the predicted label for each line.
+                Only a single predicted label per sample is supported at this time.
             excluded_columns (list[int] or list[str]): A list of names or indices of the columns
                 which are to be excluded from making model inference API calls.
@@ -384,15 +397,21 @@ def __init__(
         if dataset_type not in [
             "text/csv",
             "application/jsonlines",
+            "application/json",
             "application/x-parquet",
             "application/x-image",
         ]:
             raise ValueError(
                 f"Invalid dataset_type '{dataset_type}'."
                 f" Please check the API documentation for the supported dataset types."
             )
-        # parameters for analysis on datasets without facets are only supported for CSV datasets
-        if dataset_type != "text/csv":
+        # predicted_label and excluded_columns are only supported for tabular datasets
+        if dataset_type not in [
+            "text/csv",
+            "application/jsonlines",
+            "application/json",
+            "application/x-parquet",
+        ]:
             if predicted_label:
                 raise ValueError(
                     f"The parameter 'predicted_label' is not supported"
@@ -405,6 +424,8 @@ def __init__(
                     f" for dataset_type '{dataset_type}'."
                     f" Please check the API documentation for the supported dataset types."
                 )
+        # parameters for analysis on datasets without facets are only supported for CSV datasets
+        if dataset_type != "text/csv":
            if facet_dataset_uri or facet_headers:
                raise ValueError(
                    f"The parameters 'facet_dataset_uri' and 'facet_headers'"
@@ -417,6 +438,9 @@ def __init__(
                    f" are not supported for dataset_type '{dataset_type}'."
                    f" Please check the API documentation for the supported dataset types."
                )
+        # features JMESPath is required for JSON as we can't derive it ourselves
+        if dataset_type == "application/json" and features is None:
+            raise ValueError("features JMESPath is required for application/json dataset_type")
         self.s3_data_input_path = s3_data_input_path
         self.s3_output_path = s3_output_path
         self.s3_analysis_config_output_path = s3_analysis_config_output_path
@@ -571,11 +595,13 @@ def __init__(
                 Cannot be set when ``endpoint_name`` is set.
                 Must be set with ``instance_count``, ``model_name``
             accept_type (str): The model output format to be used for getting inferences with the
-                shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"``. Default is the same as ``content_type``.
+                shadow endpoint. Valid values are ``"text/csv"`` for CSV,
+                ``"application/jsonlines"`` for JSON Lines, and ``"application/json"`` for JSON.
+                Default is the same as ``content_type``.
             content_type (str): The model input format to be used for getting inferences with the
                 shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"``. Default is the same as ``dataset_format``.
+                ``"application/jsonlines"`` for JSON Lines. Default is the same as
+                ``dataset_format``.
             content_template (str): A template string to be used to construct the model input from
                 dataset instances. It is only used when ``model_content_type`` is
                 ``"application/jsonlines"``. The template should have one and only one placeholder,
@@ -641,7 +667,7 @@ def __init__(
         )
         self.predictor_config["endpoint_name_prefix"] = endpoint_name_prefix
         if accept_type is not None:
-            if accept_type not in ["text/csv", "application/jsonlines"]:
+            if accept_type not in ["text/csv", "application/jsonlines", "application/json"]:
                 raise ValueError(
                     f"Invalid accept_type {accept_type}."
                     f" Please choose text/csv or application/jsonlines."

src/sagemaker/feature_store/feature_group.py

Lines changed: 3 additions & 0 deletions
@@ -805,6 +805,9 @@ def ingest(
         if max_workers <= 0:
             raise RuntimeError("max_workers must be greater than 0.")

+        if profile_name is None and self.sagemaker_session.boto_session.profile_name != "default":
+            profile_name = self.sagemaker_session.boto_session.profile_name
+
         manager = IngestionManagerPandas(
             feature_group_name=self.name,
             sagemaker_session=self.sagemaker_session,
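
In effect, a named profile on the session's underlying ``boto3`` session now flows through to the ingestion workers when the caller doesn't pass one explicitly. An illustrative sketch; the profile and feature group names are placeholders:

    import boto3
    import pandas as pd
    from sagemaker import Session
    from sagemaker.feature_store.feature_group import FeatureGroup

    boto_session = boto3.Session(profile_name="my-profile")  # assumed named profile
    feature_group = FeatureGroup(
        name="my-feature-group",
        sagemaker_session=Session(boto_session=boto_session),
    )
    df = pd.DataFrame({"feature_1": [1.0, 2.0]})
    # ingest() now forwards "my-profile" (rather than None) to
    # IngestionManagerPandas, so worker processes use the same credentials.
    feature_group.ingest(data_frame=df, max_workers=2, wait=True)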

src/sagemaker/inference_recommender/inference_recommender_mixin.py

Lines changed: 3 additions & 5 deletions
@@ -463,11 +463,9 @@ def _convert_to_endpoint_configurations_json(
             parameter_range.pop("instance_types")

         for instance_type in instance_types:
-            parameter_ranges = []
-            for name, param in parameter_range.items():
-                as_json = param.as_json_range(name)
-                as_json["Value"] = as_json.pop("Values")
-                parameter_ranges.append(as_json)
+            parameter_ranges = [
+                {"Name": name, "Value": param.values} for name, param in parameter_range.items()
+            ]
             endpoint_configurations_to_json.append(
                 {
                     "EnvironmentParameterRanges": {

src/sagemaker/pytorch/estimator.py

Lines changed: 6 additions & 4 deletions
@@ -171,7 +171,10 @@ def __init__(
                 To learn more, see `Distributed PyTorch Training
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training>`_.

-                **To enable Torch Distributed (for Trainium instances only):**
+                **To enable Torch Distributed:**
+
+                    This is available for general distributed training on
+                    GPU instances from PyTorch v1.13.1 and later.

                 .. code:: python
@@ -181,6 +184,7 @@ def __init__(
                         }
                     }

+                This option also supports distributed training on Trn1.
                 To learn more, see `Distributed PyTorch Training on Trainium
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training-on-trainium>`_.
@@ -210,9 +214,7 @@ def __init__(
                 To learn more, see `Training with parameter servers
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#training-with-parameter-servers>`_.

-                **To enable distributed training with
-                `SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
-                for PyTorch:**
+                **To enable distributed training with SageMaker Training Compiler:**

                 .. code:: python
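
For reference, a sketch of the Training Compiler setup the shortened docstring points to. The ``pytorchxla`` distribution key follows the SageMaker Training Compiler documentation; the script name, versions, and instance types here are illustrative assumptions:

    from sagemaker.pytorch import PyTorch, TrainingCompilerConfig

    pt_compiled = PyTorch(
        entry_point="train.py",  # placeholder training script
        role="SageMakerRole",
        framework_version="1.13.1",
        py_version="py39",
        instance_count=2,
        instance_type="ml.p4d.24xlarge",
        compiler_config=TrainingCompilerConfig(),
        # Compiler-accelerated distributed training uses the XLA launcher.
        distribution={"pytorchxla": {"enabled": True}},
    )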

src/sagemaker/workflow/model_step.py

Lines changed: 1 addition & 0 deletions
@@ -268,6 +268,7 @@ def _append_repack_model_step(self):
             depends_on=self.depends_on,
             retry_policies=self._repack_model_retry_policies,
             output_path=self._runtime_repack_output_prefix,
+            output_kms_key=model.model_kms_key,
         )
         self.steps.append(repack_model_step)
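
One practical consequence: a KMS key set on the model now also encrypts the artifacts written by the repack step that ``ModelStep`` inserts automatically. An illustrative sketch; the image URI, S3 path, and key ARN are placeholders:

    from sagemaker.model import Model

    model = Model(
        image_uri="<inference-image-uri>",      # placeholder
        model_data="s3://bucket/model.tar.gz",  # placeholder
        role="SageMakerRole",
        # Previously this key did not reach the internal repack step's output.
        model_kms_key="arn:aws:kms:us-west-2:111122223333:key/example-key-id",
    )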

src/sagemaker/workflow/step_collections.py

Lines changed: 1 addition & 0 deletions
@@ -400,6 +400,7 @@ def __init__(
             security_group_ids=estimator.security_group_ids,
             description=description,
             display_name=display_name,
+            output_kms_key=estimator.output_kms_key,
         )
         steps.append(repack_model_step)
         model_data = repack_model_step.properties.ModelArtifacts.S3ModelArtifacts

tests/unit/sagemaker/feature_store/test_feature_group.py

Lines changed: 2 additions & 1 deletion
@@ -311,7 +311,7 @@ def test_ingest(ingestion_manager_init, sagemaker_session_mock, fs_runtime_client_config_mock):
         sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
         max_workers=10,
         max_processes=1,
-        profile_name=None,
+        profile_name=sagemaker_session_mock.boto_session.profile_name,
     )
     mock_ingestion_manager_instance.run.assert_called_once_with(
         data_frame=df, wait=True, timeout=None
@@ -323,6 +323,7 @@ def test_ingest_default(ingestion_manager_init, sagemaker_session_mock):
     sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
         fs_runtime_client_config_mock
     )
+    sagemaker_session_mock.boto_session.profile_name = "default"

     feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
     df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300)))
