@@ -42,39 +42,57 @@ def test_uri():
     assert "306415355426.dkr.ecr.us-west-2.amazonaws.com/sagemaker-clarify-processing:1.0" == uri
 
 
-def test_data_config():
+@pytest.mark.parametrize(
+    ("dataset_type", "features", "excluded_columns", "predicted_label"),
+    [
+        ("text/csv", None, ["F4"], "Predicted Label"),
+        ("application/jsonlines", None, ["F4"], "Predicted Label"),
+        ("application/json", "[*].[F1,F2,F3]", ["F4"], "Predicted Label"),
+        ("application/x-parquet", None, ["F4"], "Predicted Label"),
+    ],
+)
+def test_data_config(dataset_type, features, excluded_columns, predicted_label):
     # facets in input dataset
     s3_data_input_path = "s3://path/to/input.csv"
     s3_output_path = "s3://path/to/output"
     label_name = "Label"
-    headers = [
-        "Label",
-        "F1",
-        "F2",
-        "F3",
-        "F4",
-    ]
-    dataset_type = "text/csv"
+    headers = ["Label", "F1", "F2", "F3", "F4", "Predicted Label"]
     data_config = DataConfig(
         s3_data_input_path=s3_data_input_path,
         s3_output_path=s3_output_path,
+        features=features,
         label=label_name,
         headers=headers,
         dataset_type=dataset_type,
+        excluded_columns=excluded_columns,
+        predicted_label=predicted_label,
     )
 
     expected_config = {
-        "dataset_type": "text/csv",
+        "dataset_type": dataset_type,
         "headers": headers,
         "label": "Label",
     }
+    if features:
+        expected_config["features"] = features
+    if excluded_columns:
+        expected_config["excluded_columns"] = excluded_columns
+    if predicted_label:
+        expected_config["predicted_label"] = predicted_label
 
     assert expected_config == data_config.get_config()
     assert s3_data_input_path == data_config.s3_data_input_path
     assert s3_output_path == data_config.s3_output_path
     assert "None" == data_config.s3_compression_type
     assert "FullyReplicated" == data_config.s3_data_distribution_type
 
+
+def test_data_config_with_separate_facet_dataset():
+    s3_data_input_path = "s3://path/to/input.csv"
+    s3_output_path = "s3://path/to/output"
+    label_name = "Label"
+    headers = ["Label", "F1", "F2", "F3", "F4"]
+
     # facets NOT in input dataset
     joinsource = 5
     facet_dataset_uri = "s3://path/to/facet.csv"
@@ -89,7 +107,7 @@ def test_data_config():
         s3_output_path=s3_output_path,
         label=label_name,
         headers=headers,
-        dataset_type=dataset_type,
+        dataset_type="text/csv",
         joinsource=joinsource,
         facet_dataset_uri=facet_dataset_uri,
         facet_headers=facet_headers,
@@ -126,7 +144,7 @@ def test_data_config():
         s3_output_path=s3_output_path,
         label=label_name,
         headers=headers,
-        dataset_type=dataset_type,
+        dataset_type="text/csv",
         joinsource=joinsource,
         excluded_columns=excluded_columns,
     )
@@ -158,7 +176,7 @@ def test_invalid_data_config():
         DataConfig(
             s3_data_input_path="s3://bucket/inputpath",
             s3_output_path="s3://bucket/outputpath",
-            dataset_type="application/x-parquet",
+            dataset_type="application/x-image",
             predicted_label="label",
         )
     error_msg = r"^The parameter 'excluded_columns' is not supported for dataset_type"
@@ -189,6 +207,27 @@ def test_invalid_data_config():
         )
 
 
+def test_json_type_data_config_missing_features():
+    # facets in input dataset
+    s3_data_input_path = "s3://path/to/input.csv"
+    s3_output_path = "s3://path/to/output"
+    label_name = "Label"
+    headers = ["Label", "F1", "F2", "F3", "F4", "Predicted Label"]
+    with pytest.raises(
+        ValueError, match="features JMESPath is required for application/json dataset_type"
+    ):
+        DataConfig(
+            s3_data_input_path=s3_data_input_path,
+            s3_output_path=s3_output_path,
+            features=None,
+            label=label_name,
+            headers=headers,
+            dataset_type="application/json",
+            excluded_columns=["F4"],
+            predicted_label="Predicted Label",
+        )
+
+
 def test_s3_data_distribution_type_ignorance():
     data_config = DataConfig(
         s3_data_input_path="s3://input/train.csv",
@@ -344,12 +383,25 @@ def test_facet_of_bias_config(facet_name, facet_values_or_threshold, expected_re
     assert bias_config.get_config() == expected_config
 
 
-def test_model_config():
+@pytest.mark.parametrize(
+    ("content_type", "accept_type"),
+    [
+        # All the combinations of content_type and accept_type should be acceptable
+        ("text/csv", "text/csv"),
+        ("application/jsonlines", "application/jsonlines"),
+        ("text/csv", "application/json"),
+        ("application/jsonlines", "application/json"),
+        ("application/jsonlines", "text/csv"),
+        ("image/jpeg", "text/csv"),
+        ("image/jpg", "text/csv"),
+        ("image/png", "text/csv"),
+        ("application/x-npy", "text/csv"),
+    ],
+)
+def test_valid_model_config(content_type, accept_type):
     model_name = "xgboost-model"
     instance_type = "ml.c5.xlarge"
     instance_count = 1
-    accept_type = "text/csv"
-    content_type = "application/jsonlines"
     custom_attributes = "c000b4f9-df62-4c85-a0bf-7c525f9104a4"
     target_model = "target_model_name"
     accelerator_type = "ml.eia1.medium"