Skip to content

Commit b661c88

Browse files
authored
AOAI results nits: fix order and _result column (#40897)
* fix order and _result column
* more nits
1 parent f24c334 commit b661c88

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ def __init__(
6868
"rouge_3",
6969
"rouge_4",
7070
"rouge_5",
71-
"rouge_l",
7271
"cosine",
7372
],
7473
input: str,

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def _get_single_run_results(
208208
if run_results.status != "completed":
209209
raise EvaluationException(
210210
message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
211-
+ " failed with status {run_results.status}.",
211+
+ f" failed with status {run_results.status}.",
212212
blame=ErrorBlame.UNKNOWN,
213213
category=ErrorCategory.FAILED_EXECUTION,
214214
target=ErrorTarget.AOAI_GRADER,
@@ -240,8 +240,12 @@ def _get_single_run_results(
240240
eval_id=run_info["eval_group_id"],
241241
run_id=run_info["eval_run_id"]
242242
)
243-
listed_results = {}
243+
listed_results = {"index": []}
244+
# raw data has no order guarantees, we need to sort them by their
245+
# datasource_item_id
244246
for row_result in raw_list_results.data:
247+
# Add the datasource_item_id for later sorting
248+
listed_results["index"].append(row_result.datasource_item_id)
245249
for single_grader_row_result in row_result.results:
246250
grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
247251
for name, value in single_grader_row_result.items():
@@ -251,14 +255,19 @@ def _get_single_run_results(
251255
# create a `_result` column for each grader
252256
result_column_name = f"outputs.{grader_name}.{grader_name}_result"
253257
if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
254-
listed_results[result_column_name] = EVALUATION_PASS_FAIL_MAPPING[value]
258+
if (result_column_name not in listed_results):
259+
listed_results[result_column_name] = []
260+
listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
255261

256262
formatted_column_name = f"outputs.{grader_name}.{name}"
257263
if (formatted_column_name not in listed_results):
258264
listed_results[formatted_column_name] = []
259-
listed_results[f"outputs.{grader_name}.{name}"].append(value)
265+
listed_results[formatted_column_name].append(value)
260266
output_df = pd.DataFrame(listed_results)
261-
267+
# sort by index
268+
output_df = output_df.sort_values('index', ascending=[True])
269+
# remove index column
270+
output_df.drop(columns=["index"], inplace=True)
262271
return output_df, run_metrics
263272

264273

0 commit comments

Comments (0)