49
49
in (
50
50
"text/csv" ,
51
51
"application/jsonlines" ,
52
+ "application/json" ,
52
53
"application/sagemakercapturejson" ,
53
54
"application/x-parquet" ,
54
55
"application/x-image" ,
@@ -311,7 +312,7 @@ def __init__(
311
312
s3_analysis_config_output_path : Optional [str ] = None ,
312
313
label : Optional [str ] = None ,
313
314
headers : Optional [List [str ]] = None ,
314
- features : Optional [List [ str ] ] = None ,
315
+ features : Optional [str ] = None ,
315
316
dataset_type : str = "text/csv" ,
316
317
s3_compression_type : str = "None" ,
317
318
joinsource : Optional [Union [str , int ]] = None ,
@@ -331,12 +332,18 @@ def __init__(
331
332
If this field is None, then the ``s3_output_path`` will be used
332
333
to store the ``analysis_config`` output.
333
334
label (str): Target attribute of the model required by bias metrics. Specified as
334
- column name or index for CSV dataset or as JMESPath expression for JSONLines .
335
+ column name or index for CSV dataset or a JMESPath expression for JSON/JSON Lines .
335
336
*Required parameter* except for when the input dataset does not contain the label.
336
- features (List[str]): JMESPath expression to locate the feature columns for
337
- bias metrics if the dataset format is JSONLines.
337
+ Note: For JSON, the JMESPath query must result in a list of labels for each
338
+ sample. For JSON Lines, it must result in the label for each line.
339
+ Only a single label per sample is supported at this time.
340
+ features (str): JMESPath expression to locate the feature values
341
+ if the dataset format is JSON/JSON Lines.
342
+ Note: For JSON, the JMESPath query must result in a 2-D list (or a matrix) of
343
+ feature values. For JSON Lines, it must result in a 1-D list of features for each
344
+ line.
338
345
dataset_type (str): Format of the dataset. Valid values are ``"text/csv"`` for CSV,
339
- ``"application/jsonlines"`` for JSONLines , and
346
+ ``"application/jsonlines"`` for JSON Lines, ``"application/json"`` for JSON , and
340
347
``"application/x-parquet"`` for Parquet.
341
348
s3_compression_type (str): Valid options are ``"None"`` or ``"Gzip"``.
342
349
joinsource (str or int): The name or index of the column in the dataset that
@@ -359,6 +366,7 @@ def __init__(
359
366
360
367
Clarify will not use the ``joinsource`` column and columns present in the facet
361
368
dataset when calling model inference APIs.
369
+ Note: this is only supported for ``"text/csv"`` dataset type.
362
370
facet_headers (list[str]): List of column names in the facet dataset.
363
371
predicted_label_dataset_uri (str): Dataset S3 prefix/object URI with predicted labels,
364
372
which are used directly for analysis instead of making model inference API calls.
@@ -368,11 +376,16 @@ def __init__(
368
376
* If the dataset and predicted label dataset are in multiple files (either one),
369
377
then an index column, ``joinsource``, is required to join the two datasets.
370
378
379
+ Note: this is only supported for ``"text/csv"`` dataset type.
371
380
predicted_label_headers (list[str]): List of column names in the predicted label dataset
372
381
predicted_label (str or int): Predicted label of the target attribute of the model
373
- required for running bias analysis. Specified as column name or index for CSV data.
382
+ required for running bias analysis. Specified as column name or index for CSV data,
383
+ or a JMESPath expression for JSON/JSON Lines.
374
384
Clarify uses the predicted labels directly instead of making model inference API
375
385
calls.
386
+ Note: For JSON, the JMESPath query must result in a list of predicted labels for
387
+ each sample. For JSON Lines, it must result in the predicted label for each line.
388
+ Only a single predicted label per sample is supported at this time.
376
389
excluded_columns (list[int] or list[str]): A list of names or indices of the columns
377
390
which are to be excluded from making model inference API calls.
378
391
@@ -384,15 +397,21 @@ def __init__(
384
397
if dataset_type not in [
385
398
"text/csv" ,
386
399
"application/jsonlines" ,
400
+ "application/json" ,
387
401
"application/x-parquet" ,
388
402
"application/x-image" ,
389
403
]:
390
404
raise ValueError (
391
405
f"Invalid dataset_type '{ dataset_type } '."
392
406
f" Please check the API documentation for the supported dataset types."
393
407
)
394
- # parameters for analysis on datasets without facets are only supported for CSV datasets
395
- if dataset_type != "text/csv" :
408
+ # predicted_label and excluded_columns are only supported for tabular datasets
409
+ if dataset_type not in [
410
+ "text/csv" ,
411
+ "application/jsonlines" ,
412
+ "application/json" ,
413
+ "application/x-parquet" ,
414
+ ]:
396
415
if predicted_label :
397
416
raise ValueError (
398
417
f"The parameter 'predicted_label' is not supported"
@@ -405,6 +424,8 @@ def __init__(
405
424
f" for dataset_type '{ dataset_type } '."
406
425
f" Please check the API documentation for the supported dataset types."
407
426
)
427
+ # parameters for analysis on datasets without facets are only supported for CSV datasets
428
+ if dataset_type != "text/csv" :
408
429
if facet_dataset_uri or facet_headers :
409
430
raise ValueError (
410
431
f"The parameters 'facet_dataset_uri' and 'facet_headers'"
@@ -417,6 +438,9 @@ def __init__(
417
438
f" are not supported for dataset_type '{ dataset_type } '."
418
439
f" Please check the API documentation for the supported dataset types."
419
440
)
441
+ # features JMESPath is required for JSON as we can't derive it ourselves
442
+ if dataset_type == "application/json" and features is None :
443
+ raise ValueError ("features JMESPath is required for application/json dataset_type" )
420
444
self .s3_data_input_path = s3_data_input_path
421
445
self .s3_output_path = s3_output_path
422
446
self .s3_analysis_config_output_path = s3_analysis_config_output_path
@@ -571,11 +595,13 @@ def __init__(
571
595
Cannot be set when ``endpoint_name`` is set.
572
596
Must be set with ``instance_count`` and ``model_name``.
573
597
accept_type (str): The model output format to be used for getting inferences with the
574
- shadow endpoint. Valid values are ``"text/csv"`` for CSV and
575
- ``"application/jsonlines"``. Default is the same as ``content_type``.
598
+ shadow endpoint. Valid values are ``"text/csv"`` for CSV,
599
+ ``"application/jsonlines"`` for JSON Lines, and ``"application/json"`` for JSON.
600
+ Default is the same as ``content_type``.
576
601
content_type (str): The model input format to be used for getting inferences with the
577
602
shadow endpoint. Valid values are ``"text/csv"`` for CSV and
578
- ``"application/jsonlines"``. Default is the same as ``dataset_format``.
603
+ ``"application/jsonlines"`` for JSON Lines. Default is the same as
604
+ ``dataset_format``.
579
605
content_template (str): A template string to be used to construct the model input from
580
606
dataset instances. It is only used when ``model_content_type`` is
581
607
``"application/jsonlines"``. The template should have one and only one placeholder,
@@ -641,7 +667,7 @@ def __init__(
641
667
)
642
668
self .predictor_config ["endpoint_name_prefix" ] = endpoint_name_prefix
643
669
if accept_type is not None :
644
- if accept_type not in ["text/csv" , "application/jsonlines" ]:
670
+ if accept_type not in ["text/csv" , "application/jsonlines" , "application/json" ]:
645
671
raise ValueError (
646
672
f"Invalid accept_type { accept_type } ."
647
673
f" Please choose text/csv or application/jsonlines."
0 commit comments