Merge branch 'master' into xgb-1.7-1_launch

NikhilRaverkar · web-flow · commit 5aebd2d53976 · 2023-03-10T21:04:33.000+05:30
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## v2.137.0 (2023-03-10)
+
+### Features
+
+ * Support JSON for input dataset and model output
+
+### Bug Fixes and Other Changes
+
+ * Wait on describe for tag propagation
+ * Extract profile_name directly from sagemaker.Session if None
+ * Avoid double encoding to JSON in InferenceRecommenderMixin
+ * RepackStep must use the same KMS key as the Model
+
 ## v2.136.0 (2023-03-09)
 
 ### Features
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.136.1.dev0
+2.137.1.dev0
diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py
@@ -49,6 +49,7 @@
             in (
                 "text/csv",
                 "application/jsonlines",
+                "application/json",
                 "application/sagemakercapturejson",
                 "application/x-parquet",
                 "application/x-image",
@@ -311,7 +312,7 @@ def __init__(
         s3_analysis_config_output_path: Optional[str] = None,
         label: Optional[str] = None,
         headers: Optional[List[str]] = None,
-        features: Optional[List[str]] = None,
+        features: Optional[str] = None,
         dataset_type: str = "text/csv",
         s3_compression_type: str = "None",
         joinsource: Optional[Union[str, int]] = None,
@@ -331,12 +332,18 @@ def __init__(
                 If this field is None, then the ``s3_output_path`` will be used
                 to store the ``analysis_config`` output.
             label (str): Target attribute of the model required by bias metrics. Specified as
-                column name or index for CSV dataset or as JMESPath expression for JSONLines.
+                column name or index for CSV dataset or a JMESPath expression for JSON/JSON Lines.
                 *Required parameter* except for when the input dataset does not contain the label.
-            features (List[str]): JMESPath expression to locate the feature columns for
-                bias metrics if the dataset format is JSONLines.
+                Note: For JSON, the JMESPath query must result in a list of labels for each
+                sample.  For JSON Lines, it must result in the label for each line.
+                Only a single label per sample is supported at this time.
+            features (str): JMESPath expression to locate the feature values
+                if the dataset format is JSON/JSON Lines.
+                Note: For JSON, the JMESPath query must result in a 2-D list (or a matrix) of
+                feature values.  For JSON Lines, it must result in a 1-D list of features for each
+                line.
             dataset_type (str): Format of the dataset. Valid values are ``"text/csv"`` for CSV,
-                ``"application/jsonlines"`` for JSONLines, and
+                ``"application/jsonlines"`` for JSON Lines, ``"application/json"`` for JSON, and
                 ``"application/x-parquet"`` for Parquet.
             s3_compression_type (str): Valid options are "None" or ``"Gzip"``.
             joinsource (str or int): The name or index of the column in the dataset that
@@ -359,6 +366,7 @@ def __init__(
 
                 Clarify will not use the ``joinsource`` column and columns present in the facet
                 dataset when calling model inference APIs.
+                Note: this is only supported for ``"text/csv"`` dataset type.
             facet_headers (list[str]): List of column names in the facet dataset.
             predicted_label_dataset_uri (str): Dataset S3 prefix/object URI with predicted labels,
                 which are used directly for analysis instead of making model inference API calls.
@@ -368,11 +376,16 @@ def __init__(
                 * If the dataset and predicted label dataset are in multiple files (either one),
                   then an index column, ``joinsource``, is required to join the two datasets.
 
+                Note: this is only supported for ``"text/csv"`` dataset type.
             predicted_label_headers (list[str]): List of column names in the predicted label dataset
             predicted_label (str or int): Predicted label of the target attribute of the model
-                required for running bias analysis. Specified as column name or index for CSV data.
+                required for running bias analysis. Specified as column name or index for CSV data,
+                or a JMESPath expression for JSON/JSON Lines.
                 Clarify uses the predicted labels directly instead of making model inference API
                 calls.
+                Note: For JSON, the JMESPath query must result in a list of predicted labels for
+                each sample.  For JSON Lines, it must result in the predicted label for each line.
+                Only a single predicted label per sample is supported at this time.
             excluded_columns (list[int] or list[str]): A list of names or indices of the columns
                 which are to be excluded from making model inference API calls.
 
@@ -384,15 +397,21 @@ def __init__(
         if dataset_type not in [
             "text/csv",
             "application/jsonlines",
+            "application/json",
             "application/x-parquet",
             "application/x-image",
         ]:
             raise ValueError(
                 f"Invalid dataset_type '{dataset_type}'."
                 f" Please check the API documentation for the supported dataset types."
             )
-        # parameters for analysis on datasets without facets are only supported for CSV datasets
-        if dataset_type != "text/csv":
+        # predicted_label and excluded_columns are only supported for tabular datasets
+        if dataset_type not in [
+            "text/csv",
+            "application/jsonlines",
+            "application/json",
+            "application/x-parquet",
+        ]:
             if predicted_label:
                 raise ValueError(
                     f"The parameter 'predicted_label' is not supported"
@@ -405,6 +424,8 @@ def __init__(
                     f" for dataset_type '{dataset_type}'."
                     f" Please check the API documentation for the supported dataset types."
                 )
+        # parameters for analysis on datasets without facets are only supported for CSV datasets
+        if dataset_type != "text/csv":
             if facet_dataset_uri or facet_headers:
                 raise ValueError(
                     f"The parameters 'facet_dataset_uri' and 'facet_headers'"
@@ -417,6 +438,9 @@ def __init__(
                     f" are not supported for dataset_type '{dataset_type}'."
                     f" Please check the API documentation for the supported dataset types."
                 )
+        # features JMESPath is required for JSON as we can't derive it ourselves
+        if dataset_type == "application/json" and features is None:
+            raise ValueError("features JMESPath is required for application/json dataset_type")
         self.s3_data_input_path = s3_data_input_path
         self.s3_output_path = s3_output_path
         self.s3_analysis_config_output_path = s3_analysis_config_output_path
@@ -571,11 +595,13 @@ def __init__(
                 Cannot be set when ``endpoint_name`` is set.
                 Must be set with ``instance_count``, ``model_name``
             accept_type (str): The model output format to be used for getting inferences with the
-                shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"``. Default is the same as ``content_type``.
+                shadow endpoint. Valid values are ``"text/csv"`` for CSV,
+                ``"application/jsonlines"`` for JSON Lines, and ``"application/json"`` for JSON.
+                Default is the same as ``content_type``.
             content_type (str): The model input format to be used for getting inferences with the
                 shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"``. Default is the same as ``dataset_format``.
+                ``"application/jsonlines"`` for JSON Lines. Default is the same as
+                ``dataset_format``.
             content_template (str): A template string to be used to construct the model input from
                 dataset instances. It is only used when ``model_content_type`` is
                 ``"application/jsonlines"``. The template should have one and only one placeholder,
@@ -641,7 +667,7 @@ def __init__(
                 )
             self.predictor_config["endpoint_name_prefix"] = endpoint_name_prefix
         if accept_type is not None:
-            if accept_type not in ["text/csv", "application/jsonlines"]:
+            if accept_type not in ["text/csv", "application/jsonlines", "application/json"]:
                 raise ValueError(
                     f"Invalid accept_type {accept_type}."
                     f" Please choose text/csv or application/jsonlines."
diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py
@@ -805,6 +805,9 @@ def ingest(
         if max_workers <= 0:
             raise RuntimeError("max_workers must be greater than 0.")
 
+        if profile_name is None and self.sagemaker_session.boto_session.profile_name != "default":
+            profile_name = self.sagemaker_session.boto_session.profile_name
+
         manager = IngestionManagerPandas(
             feature_group_name=self.name,
             sagemaker_session=self.sagemaker_session,
diff --git a/src/sagemaker/inference_recommender/inference_recommender_mixin.py b/src/sagemaker/inference_recommender/inference_recommender_mixin.py
@@ -463,11 +463,9 @@ def _convert_to_endpoint_configurations_json(
             parameter_range.pop("instance_types")
 
             for instance_type in instance_types:
-                parameter_ranges = []
-                for name, param in parameter_range.items():
-                    as_json = param.as_json_range(name)
-                    as_json["Value"] = as_json.pop("Values")
-                    parameter_ranges.append(as_json)
+                parameter_ranges = [
+                    {"Name": name, "Value": param.values} for name, param in parameter_range.items()
+                ]
                 endpoint_configurations_to_json.append(
                     {
                         "EnvironmentParameterRanges": {
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -2029,7 +2029,7 @@ def logs_for_auto_ml_job(  # noqa: C901 - suppress complexity warning for this m
             exceptions.UnexpectedStatusException: If waiting and auto ml job fails.
         """
 
-        description = self.sagemaker_client.describe_auto_ml_job(AutoMLJobName=job_name)
+        description = _wait_until(lambda: self.describe_auto_ml_job(job_name), poll)
 
         instance_count, stream_names, positions, client, log_group, dot, color_wrap = _logs_init(
             self, description, job="AutoML"
@@ -4113,7 +4113,7 @@ def logs_for_job(  # noqa: C901 - suppress complexity warning for this method
             exceptions.UnexpectedStatusException: If waiting and the training job fails.
         """
 
-        description = self.sagemaker_client.describe_training_job(TrainingJobName=job_name)
+        description = _wait_until(lambda: self.describe_training_job(job_name), poll)
         print(secondary_training_status_message(description, None), end="")
 
         instance_count, stream_names, positions, client, log_group, dot, color_wrap = _logs_init(
@@ -4240,7 +4240,7 @@ def logs_for_processing_job(self, job_name, wait=False, poll=10):
             ValueError: If the processing job fails.
         """
 
-        description = self.sagemaker_client.describe_processing_job(ProcessingJobName=job_name)
+        description = _wait_until(lambda: self.describe_processing_job(job_name), poll)
 
         instance_count, stream_names, positions, client, log_group, dot, color_wrap = _logs_init(
             self, description, job="Processing"
@@ -4321,7 +4321,7 @@ def logs_for_transform_job(self, job_name, wait=False, poll=10):
             ValueError: If the transform job fails.
         """
 
-        description = self.sagemaker_client.describe_transform_job(TransformJobName=job_name)
+        description = _wait_until(lambda: self.describe_transform_job(job_name), poll)
 
         instance_count, stream_names, positions, client, log_group, dot, color_wrap = _logs_init(
             self, description, job="Transform"
diff --git a/src/sagemaker/workflow/model_step.py b/src/sagemaker/workflow/model_step.py
@@ -268,6 +268,7 @@ def _append_repack_model_step(self):
                     depends_on=self.depends_on,
                     retry_policies=self._repack_model_retry_policies,
                     output_path=self._runtime_repack_output_prefix,
+                    output_kms_key=model.model_kms_key,
                 )
                 self.steps.append(repack_model_step)
 
diff --git a/src/sagemaker/workflow/step_collections.py b/src/sagemaker/workflow/step_collections.py
@@ -400,6 +400,7 @@ def __init__(
                 security_group_ids=estimator.security_group_ids,
                 description=description,
                 display_name=display_name,
+                output_kms_key=estimator.output_kms_key,
             )
             steps.append(repack_model_step)
             model_data = repack_model_step.properties.ModelArtifacts.S3ModelArtifacts
diff --git a/tests/unit/sagemaker/feature_store/test_feature_group.py b/tests/unit/sagemaker/feature_store/test_feature_group.py
@@ -311,7 +311,7 @@ def test_ingest(ingestion_manager_init, sagemaker_session_mock, fs_runtime_clien
         sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
         max_workers=10,
         max_processes=1,
-        profile_name=None,
+        profile_name=sagemaker_session_mock.boto_session.profile_name,
     )
     mock_ingestion_manager_instance.run.assert_called_once_with(
         data_frame=df, wait=True, timeout=None
@@ -323,6 +323,7 @@ def test_ingest_default(ingestion_manager_init, sagemaker_session_mock):
     sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
         fs_runtime_client_config_mock
     )
+    sagemaker_session_mock.boto_session.profile_name = "default"
 
     feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
     df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300)))
diff --git a/tests/unit/sagemaker/inference_recommender/test_inference_recommender_mixin.py b/tests/unit/sagemaker/inference_recommender/test_inference_recommender_mixin.py
diff --git a/tests/unit/test_clarify.py b/tests/unit/test_clarify.py

Original file line number	Diff line number	Diff line change
`@@ -268,6 +268,7 @@ def _append_repack_model_step(self):`
`268`	`268`	`depends_on=self.depends_on,`
`269`	`269`	`retry_policies=self._repack_model_retry_policies,`
`270`	`270`	`output_path=self._runtime_repack_output_prefix,`
	`271`	`+ output_kms_key=model.model_kms_key,`
`271`	`272`	`)`
`272`	`273`	`self.steps.append(repack_model_step)`
`273`	`274`
Original file line number	Diff line number	Diff line change
`@@ -400,6 +400,7 @@ def __init__(`
`400`	`400`	`security_group_ids=estimator.security_group_ids,`
`401`	`401`	`description=description,`
`402`	`402`	`display_name=display_name,`
	`403`	`+ output_kms_key=estimator.output_kms_key,`
`403`	`404`	`)`
`404`	`405`	`steps.append(repack_model_step)`
`405`	`406`	`model_data = repack_model_step.properties.ModelArtifacts.S3ModelArtifacts`