fix: support specifying a facet by its column index

xgchena · BasilBeirouti · commit 64e46906be1c · 2022-01-19T13:52:22.000-08:00
Currently the Clarify BiasConfig only accepts a facet name, whereas the
Clarify analysis configuration supports both a column name and a column
index. This commit adds the same support to BiasConfig.
diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py
@@ -111,33 +111,58 @@ def __init__(
         """Initializes a configuration of the sensitive groups in the dataset.
 
         Args:
-            label_values_or_threshold (Any): List of label values or threshold to indicate positive
-                outcome used for bias metrics.
-            facet_name (str or [str]): String or List of strings of sensitive attribute(s) in the
-            input data for which we like to compare metrics.
-            facet_values_or_threshold (list): Optional list of values to form a sensitive group or
-                threshold for a numeric facet column that defines the lower bound of a sensitive
-                group. Defaults to considering each possible value as sensitive group and
-                computing metrics vs all the other examples.
-                If facet_name is a list, this needs to be None or a List consisting of lists or None
-                with the same length as facet_name list.
+            label_values_or_threshold ([int or float or str]): List of label value(s) or threshold
+                to indicate positive outcome used for bias metrics. Depending on the problem type,
+
+                * Binary problem: The list shall include one positive value.
+                * Categorical problem: The list shall include one or more (but not all) categories
+                  which are the positive values.
+                * Regression problem: The list shall include one threshold that defines the lower
+                  bound of positive values.
+
+            facet_name (str or int or [str] or [int]): Sensitive attribute column name (or index in
+                the input data) for which you would like to compute bias metrics. It can also be a
+                list of names (or indexes) to compute them for multiple sensitive attributes.
+            facet_values_or_threshold ([int or float or str] or [[int or float or str]]):
+                The parameter indicates the sensitive group. If facet_name is a scalar, then it can
+                be None or a list. Depending on the data type of the facet column,
+
+                * Binary: None means computing the bias metrics for each binary value. Or add one
+                  binary value to the list, to compute its bias metrics only.
+                * Categorical: None means computing the bias metrics for each category. Or add one
+                  or more (but not all) categories to the list, to compute their bias metrics vs.
+                  the other categories.
+                * Continuous: The list shall include one and only one threshold which defines the
+                  lower bound of a sensitive group.
+
+                If facet_name is a list, then it can be None if all facets are of binary type or
+                categorical type. Otherwise it shall be a list, and each element is the values or
+                threshold of the corresponding facet.
             group_name (str): Optional column name or index to indicate a group column to be used
                 for the bias metric 'Conditional Demographic Disparity in Labels - CDDL' or
                 'Conditional Demographic Disparity in Predicted Labels - CDDPL'.
         """
-        if isinstance(facet_name, str):
+        if isinstance(facet_name, list):
+            assert len(facet_name) > 0, "Please provide at least one facet"
+            if facet_values_or_threshold is None:
+                facet_list = [
+                    {"name_or_index": single_facet_name} for single_facet_name in facet_name
+                ]
+            elif len(facet_values_or_threshold) == len(facet_name):
+                facet_list = []
+                for i, single_facet_name in enumerate(facet_name):
+                    facet = {"name_or_index": single_facet_name}
+                    if facet_values_or_threshold is not None:
+                        _set(facet_values_or_threshold[i], "value_or_threshold", facet)
+                    facet_list.append(facet)
+            else:
+                raise ValueError(
+                    "The number of facet names doesn't match the number of facet values"
+                )
+        else:
             facet = {"name_or_index": facet_name}
             _set(facet_values_or_threshold, "value_or_threshold", facet)
             facet_list = [facet]
-        elif facet_values_or_threshold is None or len(facet_name) == len(facet_values_or_threshold):
-            facet_list = []
-            for i, single_facet_name in enumerate(facet_name):
-                facet = {"name_or_index": single_facet_name}
-                if facet_values_or_threshold is not None:
-                    _set(facet_values_or_threshold[i], "value_or_threshold", facet)
-                facet_list.append(facet)
-        else:
-            raise ValueError("Wrong combination of argument values passed")
         self.analysis_config = {
             "label_values_or_threshold": label_values_or_threshold,
             "facet": facet_list,
diff --git a/tests/unit/test_clarify.py b/tests/unit/test_clarify.py
@@ -82,7 +82,7 @@ def test_invalid_data_config():
         )
 
 
-def test_data_bias_config():
+def test_bias_config():
     label_values = [1]
     facet_name = "F1"
     facet_threshold = 0.3
@@ -103,52 +103,122 @@ def test_data_bias_config():
     assert expected_config == data_bias_config.get_config()
 
 
-def test_data_bias_config_multi_facet():
-    label_values = [1]
-    facet_name = ["Facet1", "Facet2"]
-    facet_threshold = [[0], [1, 2]]
-    group_name = "A151"
-
-    data_bias_config = BiasConfig(
-        label_values_or_threshold=label_values,
-        facet_name=facet_name,
-        facet_values_or_threshold=facet_threshold,
-        group_name=group_name,
-    )
+def test_invalid_bias_config():
+    # Empty facet list,
+    with pytest.raises(AssertionError, match="Please provide at least one facet"):
+        BiasConfig(
+            label_values_or_threshold=[1],
+            facet_name=[],
+        )
 
-    expected_config = {
-        "label_values_or_threshold": label_values,
-        "facet": [
-            {"name_or_index": facet_name[0], "value_or_threshold": facet_threshold[0]},
-            {"name_or_index": facet_name[1], "value_or_threshold": facet_threshold[1]},
-        ],
-        "group_variable": group_name,
-    }
-    assert expected_config == data_bias_config.get_config()
+    # Two facets but only one value
+    with pytest.raises(
+        ValueError, match="The number of facet names doesn't match the number of facet values"
+    ):
+        BiasConfig(
+            label_values_or_threshold=[1],
+            facet_name=["Feature1", "Feature2"],
+            facet_values_or_threshold=[[1]],
+        )
 
 
-def test_data_bias_config_multi_facet_not_all_with_value():
+@pytest.mark.parametrize(
+    "facet_name,facet_values_or_threshold,expected_result",
+    [
+        # One facet, assume that it is binary and value 1 indicates the sensitive group
+        [
+            "Feature1",
+            [1],
+            {
+                "facet": [{"name_or_index": "Feature1", "value_or_threshold": [1]}],
+            },
+        ],
+        # The same facet as above, facet value is not specified. (Clarify will compute bias metrics
+        # for each binary value).
+        [
+            "Feature1",
+            None,
+            {
+                "facet": [{"name_or_index": "Feature1"}],
+            },
+        ],
+        # Assume that the 2nd column (index 1, zero-based) of the dataset is the facet; it has
+        # four categories and two of them indicate the sensitive group.
+        [
+            1,
+            ["category1, category2"],
+            {
+                "facet": [{"name_or_index": 1, "value_or_threshold": ["category1, category2"]}],
+            },
+        ],
+        # The same facet as above, facet values are not specified. (Clarify will iterate
+        # the categories and compute bias metrics for each category).
+        [
+            1,
+            None,
+            {
+                "facet": [{"name_or_index": 1}],
+            },
+        ],
+        # Assume that the facet is a numeric value in the range [0.0, 1.0]. Given the facet
+        # threshold 0.5, the interval (0.5, 1.0] indicates the sensitive group.
+        [
+            "Feature3",
+            [0.5],
+            {
+                "facet": [{"name_or_index": "Feature3", "value_or_threshold": [0.5]}],
+            },
+        ],
+        # Multiple facets
+        [
+            ["Feature1", 1, "Feature3"],
+            [[1], ["category1, category2"], [0.5]],
+            {
+                "facet": [
+                    {"name_or_index": "Feature1", "value_or_threshold": [1]},
+                    {"name_or_index": 1, "value_or_threshold": ["category1, category2"]},
+                    {"name_or_index": "Feature3", "value_or_threshold": [0.5]},
+                ],
+            },
+        ],
+        # Multiple facets, no value or threshold
+        [
+            ["Feature1", 1, "Feature3"],
+            None,
+            {
+                "facet": [
+                    {"name_or_index": "Feature1"},
+                    {"name_or_index": 1},
+                    {"name_or_index": "Feature3"},
+                ],
+            },
+        ],
+        # Multiple facets, specify values or threshold for some of them
+        [
+            ["Feature1", 1, "Feature3"],
+            [[1], None, [0.5]],
+            {
+                "facet": [
+                    {"name_or_index": "Feature1", "value_or_threshold": [1]},
+                    {"name_or_index": 1},
+                    {"name_or_index": "Feature3", "value_or_threshold": [0.5]},
+                ],
+            },
+        ],
+    ],
+)
+def test_facet_of_bias_config(facet_name, facet_values_or_threshold, expected_result):
     label_values = [1]
-    facet_name = ["Facet1", "Facet2"]
-    facet_threshold = [[0], None]
-    group_name = "A151"
-
-    data_bias_config = BiasConfig(
+    bias_config = BiasConfig(
         label_values_or_threshold=label_values,
         facet_name=facet_name,
-        facet_values_or_threshold=facet_threshold,
-        group_name=group_name,
+        facet_values_or_threshold=facet_values_or_threshold,
     )
-
     expected_config = {
         "label_values_or_threshold": label_values,
-        "facet": [
-            {"name_or_index": facet_name[0], "value_or_threshold": facet_threshold[0]},
-            {"name_or_index": facet_name[1]},
-        ],
-        "group_variable": group_name,
+        **expected_result,
     }
-    assert expected_config == data_bias_config.get_config()
+    assert bias_config.get_config() == expected_config
 
 
 def test_model_config():