Commit 5e6da8c

Eval/bugfix/optional eval inputs (#37425)
* default evaluate column mapping
* auto column mapping in evaluate
* add * to call signatures
* account for target
* remove accidental import
* parameterize
* rename q/a to q/r
* use new exception in test
1 parent 68fc715 commit 5e6da8c
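
For orientation, a minimal sketch of what this fix enables (the evaluator class and metric here are illustrative, not part of the commit): when data column names already match an evaluator's keyword arguments, `evaluate` no longer needs an explicit column mapping.

```python
from azure.ai.evaluation import evaluate  # public API of this package

class LengthEval:
    """Toy evaluator; its keyword args ("query", "response") match the data columns."""
    def __call__(self, *, query: str, response: str):
        # Trivial metric: response length relative to query length.
        return {"length_ratio": len(response) / max(len(query), 1)}

results = evaluate(
    data="questions_answers_basic.jsonl",  # JSONL with "query" and "response" columns
    evaluators={"length": LengthEval()},
    # evaluator_config omitted: a "default" mapping such as
    # {"query": "${data.query}", "response": "${data.response}"} is now built automatically.
)
```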

File tree: 4 files changed, +179 -9 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -573,21 +573,22 @@ def _evaluate(  # pylint: disable=too-many-locals
     )

     trace_destination = pf_client._config.get_trace_destination()
-
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    if not evaluator_config:
+        evaluator_config = {}
+    if "default" not in evaluator_config:
+        evaluator_config["default"] = {}
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )

-        # Make sure, the default is always in the configuration.
-        if not evaluator_config:
-            evaluator_config = {}
-        if "default" not in evaluator_config:
-            evaluator_config["default"] = {}
-
         for evaluator_name, mapping in evaluator_config.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
@@ -604,6 +605,16 @@ def _evaluate(  # pylint: disable=too-many-locals
     # everything we need for evaluators.
     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)

+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+                evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
@@ -672,7 +683,6 @@ def _evaluate(  # pylint: disable=too-many-locals
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
```
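
Read in isolation, the new default-mapping logic amounts to the following standalone sketch (`build_default_mapping` and the literal value of `TSG_OUTPUTS` are assumptions mirroring the diff, not the module's actual helpers):

```python
import pandas as pd

# Assumption: Prefixes.TSG_OUTPUTS marks target-generated columns, per the diff comment.
TSG_OUTPUTS = "__outputs."

def build_default_mapping(input_data_df: pd.DataFrame, evaluator_config: dict) -> dict:
    """Hypothetical standalone version of the new auto-mapping loop."""
    evaluator_config.setdefault("default", {})
    for col in input_data_df.columns:
        # Skip target-generated columns and columns a target mapping already covered.
        if not col.startswith(TSG_OUTPUTS) and col not in evaluator_config["default"]:
            evaluator_config["default"][col] = f"${{data.{col}}}"
    return evaluator_config

df = pd.DataFrame({"query": ["q1"], "__outputs.response": ["r1"]})
cfg = build_default_mapping(df, {"default": {"response": "${target.response}"}})
assert cfg["default"] == {"response": "${target.response}", "query": "${data.query}"}
```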
questions_answers_basic.jsonl

Lines changed: 3 additions & 0 deletions

```diff
@@ -0,0 +1,3 @@
+{"query":"How long is flight from Earth to LV-426?","response":"There is nothing good there."}
+{"query":"Why there is no central heating on the street?","response":"There is no central heating on the streets today, but it will be, I promise."}
+{"query":"Why these questions are so strange?","response":"The life is strange..."}
```

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 113 additions & 0 deletions
```diff
@@ -68,6 +68,10 @@ def questions_wrong_file():
 def questions_answers_file():
     return _get_file("questions_answers.jsonl")

+@pytest.fixture
+def questions_answers_basic_file():
+    return _get_file("questions_answers_basic.jsonl")
+

 def _target_fn(query):
     """An example target function."""
@@ -90,6 +94,15 @@ def _target_fn2(query):
     response["query"] = f"The query is as follows: {query}"
     return response

+def _new_answer_target():
+    return {"response": "new response"}
+
+def _question_override_target(query):
+    return {"query": "new query"}
+
+def _question_answer_override_target(query, response):
+    return {"query": "new query", "response": "new response"}
+

 @pytest.mark.usefixtures("mock_model_config")
 @pytest.mark.unittest
@@ -508,3 +521,103 @@ def test_general_aggregation(self):
         assert aggregation["thing.metric"] == 3
         assert aggregation["other_thing.other_meteric"] == -3
         assert aggregation["final_thing.final_metric"] == 0.4
+
+    @pytest.mark.parametrize("use_pf_client", [True, False])
+    def test_optional_inputs_with_data(self, questions_file, questions_answers_basic_file, use_pf_client):
+        from test_evaluators.test_inputs_evaluators import (
+            NonOptionalEval,
+            HalfOptionalEval,
+            OptionalEval,
+            NoInputEval
+        )
+
+        # All variants work with both keyworded inputs
+        results = evaluate(
+            data=questions_answers_basic_file,
+            evaluators={
+                "non": NonOptionalEval(),
+                "half": HalfOptionalEval(),
+                "opt": OptionalEval(),
+                "no": NoInputEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        first_row = results["rows"][0]
+        assert first_row["outputs.non.non_score"] == 0
+        assert first_row["outputs.half.half_score"] == 1
+        assert first_row["outputs.opt.opt_score"] == 3
+        # CodeClient doesn't like no-input evals.
+        if use_pf_client:
+            assert first_row["outputs.no.no_score"] == 0
+
+        # Variant with no default inputs fails on single input
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={
+                    "non": NonOptionalEval(),
+                },
+                _use_pf_client=use_pf_client
+            )  # type: ignore
+        assert exc_info._excinfo[1].__str__() == "Missing required inputs for evaluator non : ['response']."  # type: ignore
+
+        # Variants with default answer work when only question is inputted
+        only_question_results = evaluate(
+            data=questions_file,
+            evaluators={
+                "half": HalfOptionalEval(),
+                "opt": OptionalEval(),
+                "no": NoInputEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        first_row_2 = only_question_results["rows"][0]
+        assert first_row_2["outputs.half.half_score"] == 0
+        assert first_row_2["outputs.opt.opt_score"] == 1
+        if use_pf_client:
+            assert first_row["outputs.no.no_score"] == 0
+
+    @pytest.mark.parametrize("use_pf_client", [True, False])
+    def test_optional_inputs_with_target(self, questions_file, questions_answers_basic_file, use_pf_client):
+        from test_evaluators.test_inputs_evaluators import EchoEval
+
+        # Check that target overrides default inputs
+        target_answer_results = evaluate(
+            data=questions_file,
+            target=_new_answer_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        assert target_answer_results['rows'][0]['outputs.echo.echo_query'] == 'How long is flight from Earth to LV-426?'
+        assert target_answer_results['rows'][0]['outputs.echo.echo_response'] == 'new response'
+
+        # Check that target replaces inputs from data (I.E. if both data and target have same output
+        # the target output is sent to the evaluator.)
+        question_override_results = evaluate(
+            data=questions_answers_basic_file,
+            target=_question_override_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+
+        assert question_override_results['rows'][0]['outputs.echo.echo_query'] == "new query"
+        assert question_override_results['rows'][0]['outputs.echo.echo_response'] == 'There is nothing good there.'
+
+        # Check that target can replace default and data inputs at the same time.
+        double_override_results = evaluate(
+            data=questions_answers_basic_file,
+            target=_question_answer_override_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        )  # type: ignore
+        assert double_override_results['rows'][0]['outputs.echo.echo_query'] == "new query"
+        assert double_override_results['rows'][0]['outputs.echo.echo_response'] == "new response"
```
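
The precedence these tests pin down can be summarized in a few lines (a sketch of the observable behavior, not the SDK's internal resolution code):

```python
def resolve_input(col: str, data_row: dict, target_output: dict):
    # When both the data file and the target produce a column, the target wins.
    return target_output[col] if col in target_output else data_row.get(col)

data_row = {"query": "How long is flight from Earth to LV-426?",
            "response": "There is nothing good there."}
assert resolve_input("query", data_row, {"query": "new query"}) == "new query"
assert resolve_input("response", data_row, {"query": "new query"}) == "There is nothing good there."
```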
sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py

Lines changed: 44 additions & 0 deletions

```diff
@@ -0,0 +1,44 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# A collection of very simple evaluators designed to test column mappings.
+# (aka proper data file -> __call__ input mapping)
+
+class NonOptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, query, response):
+        return {"non_score": 0}
+
+class HalfOptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, query, *, response = "default"):
+        return {"half_score": 0 if response == "default" else 1}
+
+
+class OptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, *, query = "default", response = "default"):
+        return {"opt_score": (0 if query == "default" else 1) + (0 if response == "default" else 2)}
+
+class NoInputEval():
+    def __init__(self):
+        pass
+
+    def __call__(self):
+        return {"no_score": 0}
+
+class EchoEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, *, query = "default", response = "default"):
+        return {"echo_query": query, "echo_response": response}
```
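
Called directly, the toy evaluators make the scoring scheme obvious (the import path is assumed, relative to the test directory):

```python
from test_inputs_evaluators import HalfOptionalEval, OptionalEval, EchoEval  # path assumed

assert HalfOptionalEval()(query="q")["half_score"] == 0                # response defaulted
assert HalfOptionalEval()(query="q", response="r")["half_score"] == 1  # response supplied
assert OptionalEval()(query="q", response="r")["opt_score"] == 3       # 1 (query) + 2 (response)
assert EchoEval()(query="hello")["echo_response"] == "default"         # unset kwargs echo defaults
```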
