@@ -208,7 +208,7 @@ def _get_single_run_results(
208
208
if run_results .status != "completed" :
209
209
raise EvaluationException (
210
210
message = f"AOAI evaluation run { run_info ['eval_group_id' ]} /{ run_info ['eval_run_id' ]} "
211
- + " failed with status {run_results.status}." ,
211
+ + f " failed with status { run_results .status } ." ,
212
212
blame = ErrorBlame .UNKNOWN ,
213
213
category = ErrorCategory .FAILED_EXECUTION ,
214
214
target = ErrorTarget .AOAI_GRADER ,
@@ -240,8 +240,12 @@ def _get_single_run_results(
240
240
eval_id = run_info ["eval_group_id" ],
241
241
run_id = run_info ["eval_run_id" ]
242
242
)
243
- listed_results = {}
243
+ listed_results = {"index" : []}
244
+ # raw data has no order guarantees, we need to sort them by their
245
+ # datasource_item_id
244
246
for row_result in raw_list_results .data :
247
+ # Add the datasource_item_id for later sorting
248
+ listed_results ["index" ].append (row_result .datasource_item_id )
245
249
for single_grader_row_result in row_result .results :
246
250
grader_name = run_info ["grader_name_map" ][single_grader_row_result ["name" ]]
247
251
for name , value in single_grader_row_result .items ():
@@ -251,14 +255,19 @@ def _get_single_run_results(
251
255
# create a `_result` column for each grader
252
256
result_column_name = f"outputs.{ grader_name } .{ grader_name } _result"
253
257
if len (result_column_name ) < 50 : #TODO: is this the limit? Should we keep "passed"?
254
- listed_results [result_column_name ] = EVALUATION_PASS_FAIL_MAPPING [value ]
258
+ if (result_column_name not in listed_results ):
259
+ listed_results [result_column_name ] = []
260
+ listed_results [result_column_name ].append (EVALUATION_PASS_FAIL_MAPPING [value ])
255
261
256
262
formatted_column_name = f"outputs.{ grader_name } .{ name } "
257
263
if (formatted_column_name not in listed_results ):
258
264
listed_results [formatted_column_name ] = []
259
- listed_results [f"outputs. { grader_name } . { name } " ].append (value )
265
+ listed_results [formatted_column_name ].append (value )
260
266
output_df = pd .DataFrame (listed_results )
261
-
267
+ # sort by index
268
+ output_df = output_df .sort_values ('index' , ascending = [True ])
269
+ # remove index column
270
+ output_df .drop (columns = ["index" ], inplace = True )
262
271
return output_df , run_metrics
263
272
264
273
0 commit comments