@@ -42,39 +42,57 @@ def test_uri():
     assert "306415355426.dkr.ecr.us-west-2.amazonaws.com/sagemaker-clarify-processing:1.0" == uri
 
 
-def test_data_config():
+@pytest.mark.parametrize(
+    ("dataset_type", "features", "excluded_columns", "predicted_label"),
+    [
+        ("text/csv", None, ["F4"], "Predicted Label"),
+        ("application/jsonlines", None, ["F4"], "Predicted Label"),
+        ("application/json", "[*].[F1,F2,F3]", ["F4"], "Predicted Label"),
+        ("application/x-parquet", None, ["F4"], "Predicted Label"),
+    ],
+)
+def test_data_config(dataset_type, features, excluded_columns, predicted_label):
     # facets in input dataset
     s3_data_input_path = "s3://path/to/input.csv"
     s3_output_path = "s3://path/to/output"
     label_name = "Label"
-    headers = [
-        "Label",
-        "F1",
-        "F2",
-        "F3",
-        "F4",
-    ]
-    dataset_type = "text/csv"
+    headers = ["Label", "F1", "F2", "F3", "F4", "Predicted Label"]
     data_config = DataConfig(
         s3_data_input_path=s3_data_input_path,
         s3_output_path=s3_output_path,
+        features=features,
         label=label_name,
         headers=headers,
         dataset_type=dataset_type,
+        excluded_columns=excluded_columns,
+        predicted_label=predicted_label,
     )
 
     expected_config = {
-        "dataset_type": "text/csv",
+        "dataset_type": dataset_type,
         "headers": headers,
         "label": "Label",
     }
+    if features:
+        expected_config["features"] = features
+    if excluded_columns:
+        expected_config["excluded_columns"] = excluded_columns
+    if predicted_label:
+        expected_config["predicted_label"] = predicted_label
 
     assert expected_config == data_config.get_config()
     assert s3_data_input_path == data_config.s3_data_input_path
     assert s3_output_path == data_config.s3_output_path
     assert "None" == data_config.s3_compression_type
     assert "FullyReplicated" == data_config.s3_data_distribution_type
 
+
+def test_data_config_with_separate_facet_dataset():
+    s3_data_input_path = "s3://path/to/input.csv"
+    s3_output_path = "s3://path/to/output"
+    label_name = "Label"
+    headers = ["Label", "F1", "F2", "F3", "F4"]
+
     # facets NOT in input dataset
     joinsource = 5
     facet_dataset_uri = "s3://path/to/facet.csv"
@@ -89,7 +107,7 @@ def test_data_config():
         s3_output_path=s3_output_path,
         label=label_name,
         headers=headers,
-        dataset_type=dataset_type,
+        dataset_type="text/csv",
         joinsource=joinsource,
         facet_dataset_uri=facet_dataset_uri,
         facet_headers=facet_headers,
@@ -126,7 +144,7 @@ def test_data_config():
         s3_output_path=s3_output_path,
         label=label_name,
         headers=headers,
-        dataset_type=dataset_type,
+        dataset_type="text/csv",
         joinsource=joinsource,
         excluded_columns=excluded_columns,
     )
@@ -158,7 +176,7 @@ def test_invalid_data_config():
         DataConfig(
             s3_data_input_path="s3://bucket/inputpath",
             s3_output_path="s3://bucket/outputpath",
-            dataset_type="application/x-parquet",
+            dataset_type="application/x-image",
             predicted_label="label",
         )
     error_msg = r"^The parameter 'excluded_columns' is not supported for dataset_type"
@@ -189,6 +207,27 @@ def test_invalid_data_config():
         )
 
 
+def test_json_type_data_config_missing_features():
+    # facets in input dataset
+    s3_data_input_path = "s3://path/to/input.csv"
+    s3_output_path = "s3://path/to/output"
+    label_name = "Label"
+    headers = ["Label", "F1", "F2", "F3", "F4", "Predicted Label"]
+    with pytest.raises(
+        ValueError, match="features JMESPath is required for application/json dataset_type"
+    ):
+        DataConfig(
+            s3_data_input_path=s3_data_input_path,
+            s3_output_path=s3_output_path,
+            features=None,
+            label=label_name,
+            headers=headers,
+            dataset_type="application/json",
+            excluded_columns=["F4"],
+            predicted_label="Predicted Label",
+        )
+
+
 def test_s3_data_distribution_type_ignorance():
     data_config = DataConfig(
         s3_data_input_path="s3://input/train.csv",
@@ -344,12 +383,25 @@ def test_facet_of_bias_config(facet_name, facet_values_or_threshold, expected_re
     assert bias_config.get_config() == expected_config
 
 
-def test_model_config():
+@pytest.mark.parametrize(
+    ("content_type", "accept_type"),
+    [
+        # All the combinations of content_type and accept_type should be acceptable
+        ("text/csv", "text/csv"),
+        ("application/jsonlines", "application/jsonlines"),
+        ("text/csv", "application/json"),
+        ("application/jsonlines", "application/json"),
+        ("application/jsonlines", "text/csv"),
+        ("image/jpeg", "text/csv"),
+        ("image/jpg", "text/csv"),
+        ("image/png", "text/csv"),
+        ("application/x-npy", "text/csv"),
+    ],
+)
+def test_valid_model_config(content_type, accept_type):
     model_name = "xgboost-model"
     instance_type = "ml.c5.xlarge"
     instance_count = 1
-    accept_type = "text/csv"
-    content_type = "application/jsonlines"
     custom_attributes = "c000b4f9-df62-4c85-a0bf-7c525f9104a4"
     target_model = "target_model_name"
     accelerator_type = "ml.eia1.medium"