Commit 5e6da8c

Eval/bugfix/optional eval inputs (#37425)
* default evaluate column mapping
* auto column mapping in evaluate
* add * to call signatures
* account for target
* remove accidental import
* parameterize
* rename q/a to q/r
* use new exception in test
1 parent 68fc715 commit 5e6da8c
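
For orientation, a minimal sketch of what this fix enables (the evaluator class and metric here are illustrative, not part of the commit): when data column names already match an evaluator's keyword arguments, `evaluate` no longer needs an explicit column mapping.

```python
from azure.ai.evaluation import evaluate  # public API of this package

class LengthEval:
    """Toy evaluator; its keyword args ("query", "response") match the data columns."""
    def __call__(self, *, query: str, response: str):
        # Trivial metric: response length relative to query length.
        return {"length_ratio": len(response) / max(len(query), 1)}

results = evaluate(
    data="questions_answers_basic.jsonl",  # JSONL with "query" and "response" columns
    evaluators={"length": LengthEval()},
    # evaluator_config omitted: a "default" mapping such as
    # {"query": "${data.query}", "response": "${data.response}"} is now built automatically.
)
```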

File tree: 4 files changed, +179 -9 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -573,21 +573,22 @@ def _evaluate(  # pylint: disable=too-many-locals
     )

     trace_destination = pf_client._config.get_trace_destination()
-
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    if not evaluator_config:
+        evaluator_config = {}
+    if "default" not in evaluator_config:
+        evaluator_config["default"] = {}
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )

-        # Make sure, the default is always in the configuration.
-        if not evaluator_config:
-            evaluator_config = {}
-        if "default" not in evaluator_config:
-            evaluator_config["default"] = {}
-
         for evaluator_name, mapping in evaluator_config.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
@@ -604,6 +605,16 @@ def _evaluate(  # pylint: disable=too-many-locals
     # everything we need for evaluators.
     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)

+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+                evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
@@ -672,7 +683,6 @@ def _evaluate(  # pylint: disable=too-many-locals
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
```
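
Read in isolation, the new default-mapping logic amounts to the following standalone sketch (`build_default_mapping` and the literal value of `TSG_OUTPUTS` are assumptions mirroring the diff, not the module's actual helpers):

```python
import pandas as pd

# Assumption: Prefixes.TSG_OUTPUTS marks target-generated columns, per the diff comment.
TSG_OUTPUTS = "__outputs."

def build_default_mapping(input_data_df: pd.DataFrame, evaluator_config: dict) -> dict:
    """Hypothetical standalone version of the new auto-mapping loop."""
    evaluator_config.setdefault("default", {})
    for col in input_data_df.columns:
        # Skip target-generated columns and columns a target mapping already covered.
        if not col.startswith(TSG_OUTPUTS) and col not in evaluator_config["default"]:
            evaluator_config["default"][col] = f"${{data.{col}}}"
    return evaluator_config

df = pd.DataFrame({"query": ["q1"], "__outputs.response": ["r1"]})
cfg = build_default_mapping(df, {"default": {"response": "${target.response}"}})
assert cfg["default"] == {"response": "${target.response}", "query": "${data.query}"}
```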
questions_answers_basic.jsonl

Lines changed: 3 additions & 0 deletions

```diff
@@ -0,0 +1,3 @@
+{"query":"How long is flight from Earth to LV-426?","response":"There is nothing good there."}
+{"query":"Why there is no central heating on the street?","response":"There is no central heating on the streets today, but it will be, I promise."}
+{"query":"Why these questions are so strange?","response":"The life is strange..."}
```

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 113 additions & 0 deletions
```diff
@@ -68,6 +68,10 @@ def questions_wrong_file():
 def questions_answers_file():
     return _get_file("questions_answers.jsonl")

+@pytest.fixture
+def questions_answers_basic_file():
+    return _get_file("questions_answers_basic.jsonl")
+

 def _target_fn(query):
     """An example target function."""
@@ -90,6 +94,15 @@ def _target_fn2(query):
     response["query"] = f"The query is as follows: {query}"
     return response

+def _new_answer_target():
+    return {"response": "new response"}
+
+def _question_override_target(query):
+    return {"query": "new query"}
+
+def _question_answer_override_target(query, response):
+    return {"query": "new query", "response": "new response"}
+

 @pytest.mark.usefixtures("mock_model_config")
 @pytest.mark.unittest
@@ -508,3 +521,103 @@ def test_general_aggregation(self):
         assert aggregation["thing.metric"] == 3
         assert aggregation["other_thing.other_meteric"] == -3
         assert aggregation["final_thing.final_metric"] == 0.4
+
+    @pytest.mark.parametrize("use_pf_client", [True, False])
+    def test_optional_inputs_with_data(self, questions_file, questions_answers_basic_file, use_pf_client):
+        from test_evaluators.test_inputs_evaluators import (
+            NonOptionalEval,
+            HalfOptionalEval,
+            OptionalEval,
+            NoInputEval
+        )
+
+        # All variants work with both keyworded inputs
+        results = evaluate(
+            data=questions_answers_basic_file,
+            evaluators={
+                "non": NonOptionalEval(),
+                "half": HalfOptionalEval(),
+                "opt": OptionalEval(),
+                "no": NoInputEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        first_row = results["rows"][0]
+        assert first_row["outputs.non.non_score"] == 0
+        assert first_row["outputs.half.half_score"] == 1
+        assert first_row["outputs.opt.opt_score"] == 3
+        # CodeClient doesn't like no-input evals.
+        if use_pf_client:
+            assert first_row["outputs.no.no_score"] == 0
+
+        # Variant with no default inputs fails on single input
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={
+                    "non": NonOptionalEval(),
+                },
+                _use_pf_client=use_pf_client
+            )  # type: ignore
+        assert exc_info._excinfo[1].__str__() == "Missing required inputs for evaluator non : ['response']."  # type: ignore
+
+        # Variants with default answer work when only question is inputted
+        only_question_results = evaluate(
+            data=questions_file,
+            evaluators={
+                "half": HalfOptionalEval(),
+                "opt": OptionalEval(),
+                "no": NoInputEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        first_row_2 = only_question_results["rows"][0]
+        assert first_row_2["outputs.half.half_score"] == 0
+        assert first_row_2["outputs.opt.opt_score"] == 1
+        if use_pf_client:
+            assert first_row["outputs.no.no_score"] == 0
+
+    @pytest.mark.parametrize("use_pf_client", [True, False])
+    def test_optional_inputs_with_target(self, questions_file, questions_answers_basic_file, use_pf_client):
+        from test_evaluators.test_inputs_evaluators import EchoEval
+
+        # Check that target overrides default inputs
+        target_answer_results = evaluate(
+            data=questions_file,
+            target=_new_answer_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        assert target_answer_results['rows'][0]['outputs.echo.echo_query'] == 'How long is flight from Earth to LV-426?'
+        assert target_answer_results['rows'][0]['outputs.echo.echo_response'] == 'new response'
+
+        # Check that target replaces inputs from data (I.E. if both data and target have same output
+        # the target output is sent to the evaluator.)
+        question_override_results = evaluate(
+            data=questions_answers_basic_file,
+            target=_question_override_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        assert question_override_results['rows'][0]['outputs.echo.echo_query'] == "new query"
+        assert question_override_results['rows'][0]['outputs.echo.echo_response'] == 'There is nothing good there.'
+
+        # Check that target can replace default and data inputs at the same time.
+        double_override_results = evaluate(
+            data=questions_answers_basic_file,
+            target=_question_answer_override_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+        assert double_override_results['rows'][0]['outputs.echo.echo_query'] == "new query"
+        assert double_override_results['rows'][0]['outputs.echo.echo_response'] == "new response"
```
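
The precedence these tests pin down can be summarized in a few lines (a sketch of the observable behavior, not the SDK's internal resolution code):

```python
def resolve_input(col: str, data_row: dict, target_output: dict):
    # When both the data file and the target produce a column, the target wins.
    return target_output[col] if col in target_output else data_row.get(col)

data_row = {"query": "How long is flight from Earth to LV-426?",
            "response": "There is nothing good there."}
assert resolve_input("query", data_row, {"query": "new query"}) == "new query"
assert resolve_input("response", data_row, {"query": "new query"}) == "There is nothing good there."
```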
sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py

Lines changed: 44 additions & 0 deletions

```diff
@@ -0,0 +1,44 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# A collection of very simple evaluators designed to test column mappings.
+# (aka proper data file -> __call__ input mapping)
+
+class NonOptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, query, response):
+        return {"non_score": 0}
+
+class HalfOptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, query, *, response = "default"):
+        return {"half_score": 0 if response == "default" else 1}
+
+
+class OptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, *, query = "default", response = "default"):
+        return {"opt_score": (0 if query == "default" else 1) + (0 if response == "default" else 2)}
+
+class NoInputEval():
+    def __init__(self):
+        pass
+
+    def __call__(self):
+        return {"no_score": 0}
+
+class EchoEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, *, query = "default", response = "default"):
+        return {"echo_query": query, "echo_response": response}
```
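
Called directly, the toy evaluators make the scoring scheme obvious (the import path is assumed, relative to the test directory):

```python
from test_inputs_evaluators import HalfOptionalEval, OptionalEval, EchoEval  # path assumed

assert HalfOptionalEval()(query="q")["half_score"] == 0                # response defaulted
assert HalfOptionalEval()(query="q", response="r")["half_score"] == 1  # response supplied
assert OptionalEval()(query="q", response="r")["opt_score"] == 3       # 1 (query) + 2 (response)
assert EchoEval()(query="hello")["echo_response"] == "default"         # unset kwargs echo defaults
```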
