Commit 7989603

Merge branch 'master' into update-hf-pt-train-dlc
2 parents 972a60f + 7f823e1

12 files changed: +284 −104 lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -1,5 +1,22 @@
 # Changelog

+## v2.136.0 (2023-03-09)
+
+### Features
+
+ * with_feature_group [feature_store]
+ * Djl Large Model Support
+ * Decouple model.right_size() from model registry
+
+### Bug Fixes and Other Changes
+
+ * Fix integration test error in test_default_right_size_and_deploy_unregistered_base_model
+ * Add djl 0.21.0 dlc images
+
+### Documentation Changes
+
+ * Torchrun gpu support documentation change
+
 ## v2.135.1.post0 (2023-03-02)

 ### Documentation Changes

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.135.2.dev0
+2.136.1.dev0

doc/frameworks/pytorch/using_pytorch.rst

Lines changed: 36 additions & 6 deletions
@@ -196,6 +196,7 @@ fit Optional Arguments
 - ``logs``: Defaults to True, whether to show logs produced by training
   job in the Python session. Only meaningful when wait is True.

+----

 Distributed PyTorch Training
 ============================
@@ -262,16 +263,18 @@ during the PyTorch DDP initialization.

 .. note::

-    The SageMaker PyTorch estimator operates ``mpirun`` in the backend.
-    It doesn’t use ``torchrun`` for distributed training.
+    The SageMaker PyTorch estimator can operate both ``mpirun`` (for PyTorch 1.12.0 and later)
+    and ``torchrun`` (for PyTorch 1.13.1 and later) in the backend for distributed training.

 For more information about setting up PyTorch DDP in your training script,
 see `Getting Started with Distributed Data Parallel
 <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`_ in the
 PyTorch documentation.

-The following example shows how to run a PyTorch DDP training in SageMaker
-using two ``ml.p4d.24xlarge`` instances:
+The following examples show how to set a PyTorch estimator
+to run a distributed training job on two ``ml.p4d.24xlarge`` instances.
+
+**Using PyTorch DDP with the mpirun backend**

 .. code:: python

@@ -291,7 +294,34 @@ using two ``ml.p4d.24xlarge`` instances:
         }
     )

-    pt_estimator.fit("s3://bucket/path/to/training/data")
+**Using PyTorch DDP with the torchrun backend**
+
+.. code:: python
+
+    from sagemaker.pytorch import PyTorch
+
+    pt_estimator = PyTorch(
+        entry_point="train_ptddp.py",
+        role="SageMakerRole",
+        framework_version="1.13.1",
+        py_version="py38",
+        instance_count=2,
+        instance_type="ml.p4d.24xlarge",
+        distribution={
+            "torch_distributed": {
+                "enabled": True
+            }
+        }
+    )
+
+
+.. note::
+
+    For more information about setting up ``torchrun`` in your training script,
+    see `torchrun (Elastic Launch) <https://pytorch.org/docs/stable/elastic/run.html>`_ in *the
+    PyTorch documentation*.
+
+----

 .. _distributed-pytorch-training-on-trainium:

@@ -324,7 +354,7 @@ with the ``torch_distributed`` option as the distribution strategy.

 .. note::

-    SageMaker Debugger is currently not supported with Trn1 instances.
+    SageMaker Debugger is not compatible with Trn1 instances.

 Adapt Your Training Script to Initialize with the XLA backend
 -------------------------------------------------------------
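
Launching the estimator is unchanged in either case; as a minimal sketch, restoring the call shown in the earlier mpirun example (the S3 URI is a placeholder):

    # Start the distributed training job; the backend (mpirun or torchrun)
    # is selected by the distribution option configured on the estimator above.
    pt_estimator.fit("s3://bucket/path/to/training/data")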

src/sagemaker/clarify.py

Lines changed: 38 additions & 12 deletions
@@ -49,6 +49,7 @@
     in (
         "text/csv",
         "application/jsonlines",
+        "application/json",
         "application/sagemakercapturejson",
         "application/x-parquet",
         "application/x-image",
@@ -311,7 +312,7 @@ def __init__(
         s3_analysis_config_output_path: Optional[str] = None,
         label: Optional[str] = None,
         headers: Optional[List[str]] = None,
-        features: Optional[List[str]] = None,
+        features: Optional[str] = None,
         dataset_type: str = "text/csv",
         s3_compression_type: str = "None",
         joinsource: Optional[Union[str, int]] = None,
@@ -331,12 +332,18 @@ def __init__(
                 If this field is None, then the ``s3_output_path`` will be used
                 to store the ``analysis_config`` output.
             label (str): Target attribute of the model required by bias metrics. Specified as
-                column name or index for CSV dataset or as JMESPath expression for JSONLines.
+                column name or index for CSV dataset or a JMESPath expression for JSON/JSON Lines.
                 *Required parameter* except for when the input dataset does not contain the label.
-            features (List[str]): JMESPath expression to locate the feature columns for
-                bias metrics if the dataset format is JSONLines.
+                Note: For JSON, the JMESPath query must result in a list of labels for each
+                sample. For JSON Lines, it must result in the label for each line.
+                Only a single label per sample is supported at this time.
+            features (str): JMESPath expression to locate the feature values
+                if the dataset format is JSON/JSON Lines.
+                Note: For JSON, the JMESPath query must result in a 2-D list (or a matrix) of
+                feature values. For JSON Lines, it must result in a 1-D list of features for each
+                line.
             dataset_type (str): Format of the dataset. Valid values are ``"text/csv"`` for CSV,
-                ``"application/jsonlines"`` for JSONLines, and
+                ``"application/jsonlines"`` for JSON Lines, ``"application/json"`` for JSON, and
                 ``"application/x-parquet"`` for Parquet.
             s3_compression_type (str): Valid options are "None" or ``"Gzip"``.
             joinsource (str or int): The name or index of the column in the dataset that
@@ -359,6 +366,7 @@ def __init__(

                 Clarify will not use the ``joinsource`` column and columns present in the facet
                 dataset when calling model inference APIs.
+                Note: this is only supported for ``"text/csv"`` dataset type.
             facet_headers (list[str]): List of column names in the facet dataset.
             predicted_label_dataset_uri (str): Dataset S3 prefix/object URI with predicted labels,
                 which are used directly for analysis instead of making model inference API calls.
@@ -368,11 +376,16 @@ def __init__(
                 * If the dataset and predicted label dataset are in multiple files (either one),
                   then an index column, ``joinsource``, is required to join the two datasets.

+                Note: this is only supported for ``"text/csv"`` dataset type.
             predicted_label_headers (list[str]): List of column names in the predicted label dataset
             predicted_label (str or int): Predicted label of the target attribute of the model
-                required for running bias analysis. Specified as column name or index for CSV data.
+                required for running bias analysis. Specified as column name or index for CSV data,
+                or a JMESPath expression for JSON/JSON Lines.
                 Clarify uses the predicted labels directly instead of making model inference API
                 calls.
+                Note: For JSON, the JMESPath query must result in a list of predicted labels for
+                each sample. For JSON Lines, it must result in the predicted label for each line.
+                Only a single predicted label per sample is supported at this time.
             excluded_columns (list[int] or list[str]): A list of names or indices of the columns
                 which are to be excluded from making model inference API calls.
@@ -384,15 +397,21 @@ def __init__(
         if dataset_type not in [
             "text/csv",
             "application/jsonlines",
+            "application/json",
             "application/x-parquet",
             "application/x-image",
         ]:
             raise ValueError(
                 f"Invalid dataset_type '{dataset_type}'."
                 f" Please check the API documentation for the supported dataset types."
             )
-        # parameters for analysis on datasets without facets are only supported for CSV datasets
-        if dataset_type != "text/csv":
+        # predicted_label and excluded_columns are only supported for tabular datasets
+        if dataset_type not in [
+            "text/csv",
+            "application/jsonlines",
+            "application/json",
+            "application/x-parquet",
+        ]:
             if predicted_label:
                 raise ValueError(
                     f"The parameter 'predicted_label' is not supported"
@@ -405,6 +424,8 @@ def __init__(
                     f" for dataset_type '{dataset_type}'."
                     f" Please check the API documentation for the supported dataset types."
                 )
+        # parameters for analysis on datasets without facets are only supported for CSV datasets
+        if dataset_type != "text/csv":
            if facet_dataset_uri or facet_headers:
                raise ValueError(
                    f"The parameters 'facet_dataset_uri' and 'facet_headers'"
@@ -417,6 +438,9 @@ def __init__(
                    f" are not supported for dataset_type '{dataset_type}'."
                    f" Please check the API documentation for the supported dataset types."
                )
+        # features JMESPath is required for JSON as we can't derive it ourselves
+        if dataset_type == "application/json" and features is None:
+            raise ValueError("features JMESPath is required for application/json dataset_type")
         self.s3_data_input_path = s3_data_input_path
         self.s3_output_path = s3_output_path
         self.s3_analysis_config_output_path = s3_analysis_config_output_path
@@ -571,11 +595,13 @@ def __init__(
                 Cannot be set when ``endpoint_name`` is set.
                 Must be set with ``instance_count``, ``model_name``
             accept_type (str): The model output format to be used for getting inferences with the
-                shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"``. Default is the same as ``content_type``.
+                shadow endpoint. Valid values are ``"text/csv"`` for CSV,
+                ``"application/jsonlines"`` for JSON Lines, and ``"application/json"`` for JSON.
+                Default is the same as ``content_type``.
             content_type (str): The model input format to be used for getting inferences with the
                 shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"``. Default is the same as ``dataset_format``.
+                ``"application/jsonlines"`` for JSON Lines. Default is the same as
+                ``dataset_format``.
             content_template (str): A template string to be used to construct the model input from
                 dataset instances. It is only used when ``model_content_type`` is
                 ``"application/jsonlines"``. The template should have one and only one placeholder,
@@ -641,7 +667,7 @@ def __init__(
         )
         self.predictor_config["endpoint_name_prefix"] = endpoint_name_prefix
         if accept_type is not None:
-            if accept_type not in ["text/csv", "application/jsonlines"]:
+            if accept_type not in ["text/csv", "application/jsonlines", "application/json"]:
                 raise ValueError(
                     f"Invalid accept_type {accept_type}."
                     f" Please choose text/csv or application/jsonlines."

src/sagemaker/feature_store/feature_group.py

Lines changed: 3 additions & 0 deletions
@@ -805,6 +805,9 @@ def ingest(
         if max_workers <= 0:
             raise RuntimeError("max_workers must be greater than 0.")

+        if profile_name is None and self.sagemaker_session.boto_session.profile_name != "default":
+            profile_name = self.sagemaker_session.boto_session.profile_name
+
         manager = IngestionManagerPandas(
             feature_group_name=self.name,
             sagemaker_session=self.sagemaker_session,
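
In effect, a named profile on the session's underlying ``boto3`` session now flows through to the ingestion workers when the caller doesn't pass one explicitly. An illustrative sketch; the profile and feature group names are placeholders:

    import boto3
    import pandas as pd
    from sagemaker import Session
    from sagemaker.feature_store.feature_group import FeatureGroup

    boto_session = boto3.Session(profile_name="my-profile")  # assumed named profile
    feature_group = FeatureGroup(
        name="my-feature-group",
        sagemaker_session=Session(boto_session=boto_session),
    )
    df = pd.DataFrame({"feature_1": [1.0, 2.0]})
    # ingest() now forwards "my-profile" (rather than None) to
    # IngestionManagerPandas, so worker processes use the same credentials.
    feature_group.ingest(data_frame=df, max_workers=2, wait=True)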

src/sagemaker/inference_recommender/inference_recommender_mixin.py

Lines changed: 3 additions & 5 deletions
@@ -463,11 +463,9 @@ def _convert_to_endpoint_configurations_json(
             parameter_range.pop("instance_types")

         for instance_type in instance_types:
-            parameter_ranges = []
-            for name, param in parameter_range.items():
-                as_json = param.as_json_range(name)
-                as_json["Value"] = as_json.pop("Values")
-                parameter_ranges.append(as_json)
+            parameter_ranges = [
+                {"Name": name, "Value": param.values} for name, param in parameter_range.items()
+            ]
             endpoint_configurations_to_json.append(
                 {
                     "EnvironmentParameterRanges": {

src/sagemaker/pytorch/estimator.py

Lines changed: 6 additions & 4 deletions
@@ -171,7 +171,10 @@ def __init__(
                 To learn more, see `Distributed PyTorch Training
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training>`_.

-                **To enable Torch Distributed (for Trainium instances only):**
+                **To enable Torch Distributed:**
+
+                    This is available for general distributed training on
+                    GPU instances from PyTorch v1.13.1 and later.

                 .. code:: python
@@ -181,6 +184,7 @@ def __init__(
                         }
                     }

+                This option also supports distributed training on Trn1.
                 To learn more, see `Distributed PyTorch Training on Trainium
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training-on-trainium>`_.
@@ -210,9 +214,7 @@ def __init__(
                 To learn more, see `Training with parameter servers
                 <https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#training-with-parameter-servers>`_.

-                **To enable distributed training with
-                `SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
-                for PyTorch:**
+                **To enable distributed training with SageMaker Training Compiler:**

                 .. code:: python
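
For reference, a sketch of the Training Compiler setup the shortened docstring points to. The ``pytorchxla`` distribution key follows the SageMaker Training Compiler documentation; the script name, versions, and instance types here are illustrative assumptions:

    from sagemaker.pytorch import PyTorch, TrainingCompilerConfig

    pt_compiled = PyTorch(
        entry_point="train.py",  # placeholder training script
        role="SageMakerRole",
        framework_version="1.13.1",
        py_version="py39",
        instance_count=2,
        instance_type="ml.p4d.24xlarge",
        compiler_config=TrainingCompilerConfig(),
        # Compiler-accelerated distributed training uses the XLA launcher.
        distribution={"pytorchxla": {"enabled": True}},
    )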

src/sagemaker/workflow/model_step.py

Lines changed: 1 addition & 0 deletions
@@ -268,6 +268,7 @@ def _append_repack_model_step(self):
             depends_on=self.depends_on,
             retry_policies=self._repack_model_retry_policies,
             output_path=self._runtime_repack_output_prefix,
+            output_kms_key=model.model_kms_key,
         )
         self.steps.append(repack_model_step)
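
One practical consequence: a KMS key set on the model now also encrypts the artifacts written by the repack step that ``ModelStep`` inserts automatically. An illustrative sketch; the image URI, S3 path, and key ARN are placeholders:

    from sagemaker.model import Model

    model = Model(
        image_uri="<inference-image-uri>",      # placeholder
        model_data="s3://bucket/model.tar.gz",  # placeholder
        role="SageMakerRole",
        # Previously this key did not reach the internal repack step's output.
        model_kms_key="arn:aws:kms:us-west-2:111122223333:key/example-key-id",
    )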

src/sagemaker/workflow/step_collections.py

Lines changed: 1 addition & 0 deletions
@@ -400,6 +400,7 @@ def __init__(
             security_group_ids=estimator.security_group_ids,
             description=description,
             display_name=display_name,
+            output_kms_key=estimator.output_kms_key,
         )
         steps.append(repack_model_step)
         model_data = repack_model_step.properties.ModelArtifacts.S3ModelArtifacts

tests/unit/sagemaker/feature_store/test_feature_group.py

Lines changed: 2 additions & 1 deletion
@@ -311,7 +311,7 @@ def test_ingest(ingestion_manager_init, sagemaker_session_mock, fs_runtime_client_config_mock):
         sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
         max_workers=10,
         max_processes=1,
-        profile_name=None,
+        profile_name=sagemaker_session_mock.boto_session.profile_name,
     )
     mock_ingestion_manager_instance.run.assert_called_once_with(
         data_frame=df, wait=True, timeout=None
@@ -323,6 +323,7 @@ def test_ingest_default(ingestion_manager_init, sagemaker_session_mock):
     sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
         fs_runtime_client_config_mock
     )
+    sagemaker_session_mock.boto_session.profile_name = "default"

     feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
     df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300)))
