@@ -42,39 +42,57 @@ def test_uri():
     assert "306415355426.dkr.ecr.us-west-2.amazonaws.com/sagemaker-clarify-processing:1.0" == uri
 
 
-def test_data_config():
+@pytest.mark.parametrize(
+    ("dataset_type", "features", "excluded_columns", "predicted_label"),
+    [
+        ("text/csv", None, ["F4"], "Predicted Label"),
+        ("application/jsonlines", None, ["F4"], "Predicted Label"),
+        ("application/json", "[*].[F1,F2,F3]", ["F4"], "Predicted Label"),
+        ("application/x-parquet", None, ["F4"], "Predicted Label"),
+    ],
+)
+def test_data_config(dataset_type, features, excluded_columns, predicted_label):
     # facets in input dataset
     s3_data_input_path = "s3://path/to/input.csv"
     s3_output_path = "s3://path/to/output"
     label_name = "Label"
-    headers = [
-        "Label",
-        "F1",
-        "F2",
-        "F3",
-        "F4",
-    ]
-    dataset_type = "text/csv"
+    headers = ["Label", "F1", "F2", "F3", "F4", "Predicted Label"]
     data_config = DataConfig(
         s3_data_input_path=s3_data_input_path,
         s3_output_path=s3_output_path,
+        features=features,
         label=label_name,
         headers=headers,
         dataset_type=dataset_type,
+        excluded_columns=excluded_columns,
+        predicted_label=predicted_label,
     )
 
     expected_config = {
-        "dataset_type": "text/csv",
+        "dataset_type": dataset_type,
         "headers": headers,
         "label": "Label",
     }
+    if features:
+        expected_config["features"] = features
+    if excluded_columns:
+        expected_config["excluded_columns"] = excluded_columns
+    if predicted_label:
+        expected_config["predicted_label"] = predicted_label
 
     assert expected_config == data_config.get_config()
     assert s3_data_input_path == data_config.s3_data_input_path
     assert s3_output_path == data_config.s3_output_path
     assert "None" == data_config.s3_compression_type
     assert "FullyReplicated" == data_config.s3_data_distribution_type
 
+
+def test_data_config_with_separate_facet_dataset():
+    s3_data_input_path = "s3://path/to/input.csv"
+    s3_output_path = "s3://path/to/output"
+    label_name = "Label"
+    headers = ["Label", "F1", "F2", "F3", "F4"]
+
     # facets NOT in input dataset
     joinsource = 5
     facet_dataset_uri = "s3://path/to/facet.csv"
@@ -89,7 +107,7 @@ def test_data_config():
         s3_output_path=s3_output_path,
         label=label_name,
         headers=headers,
-        dataset_type=dataset_type,
+        dataset_type="text/csv",
         joinsource=joinsource,
         facet_dataset_uri=facet_dataset_uri,
         facet_headers=facet_headers,
@@ -126,7 +144,7 @@ def test_data_config():
         s3_output_path=s3_output_path,
         label=label_name,
         headers=headers,
-        dataset_type=dataset_type,
+        dataset_type="text/csv",
         joinsource=joinsource,
         excluded_columns=excluded_columns,
     )
@@ -158,7 +176,7 @@ def test_invalid_data_config():
         DataConfig(
             s3_data_input_path="s3://bucket/inputpath",
             s3_output_path="s3://bucket/outputpath",
-            dataset_type="application/x-parquet",
+            dataset_type="application/x-image",
             predicted_label="label",
         )
     error_msg = r"^The parameter 'excluded_columns' is not supported for dataset_type"
@@ -189,6 +207,28 @@ def test_invalid_data_config():
         )
 
 
+# features JMESPath is required for JSON dataset types
+def test_json_type_data_config_missing_features():
+    # facets in input dataset
+    s3_data_input_path = "s3://path/to/input.csv"
+    s3_output_path = "s3://path/to/output"
+    label_name = "Label"
+    headers = ["Label", "F1", "F2", "F3", "F4", "Predicted Label"]
+    with pytest.raises(
+        ValueError, match="features JMESPath is required for application/json dataset_type"
+    ):
+        DataConfig(
+            s3_data_input_path=s3_data_input_path,
+            s3_output_path=s3_output_path,
+            features=None,
+            label=label_name,
+            headers=headers,
+            dataset_type="application/json",
+            excluded_columns=["F4"],
+            predicted_label="Predicted Label",
+        )
+
+
 def test_s3_data_distribution_type_ignorance():
     data_config = DataConfig(
         s3_data_input_path="s3://input/train.csv",
@@ -344,12 +384,25 @@ def test_facet_of_bias_config(facet_name, facet_values_or_threshold, expected_re
     assert bias_config.get_config() == expected_config
 
 
-def test_model_config():
+@pytest.mark.parametrize(
+    ("content_type", "accept_type"),
+    [
+        # All the combinations of content_type and accept_type should be acceptable
+        ("text/csv", "text/csv"),
+        ("application/jsonlines", "application/jsonlines"),
+        ("text/csv", "application/json"),
+        ("application/jsonlines", "application/json"),
+        ("application/jsonlines", "text/csv"),
+        ("image/jpeg", "text/csv"),
+        ("image/jpg", "text/csv"),
+        ("image/png", "text/csv"),
+        ("application/x-npy", "text/csv"),
+    ],
+)
+def test_valid_model_config(content_type, accept_type):
     model_name = "xgboost-model"
     instance_type = "ml.c5.xlarge"
     instance_count = 1
-    accept_type = "text/csv"
-    content_type = "application/jsonlines"
     custom_attributes = "c000b4f9-df62-4c85-a0bf-7c525f9104a4"
     target_model = "target_model_name"
     accelerator_type = "ml.eia1.medium"