
Commit 45f482f

Remove numpy Dependency from Evaluation SDK (#37688)
* Remove numpy Dependency
* changelog entry
* remove numpy from shared_requirements
* use list comprehension instead
* fix math not defined issue
* fix math import and remove numpy in tests
* address some comments, fix tests
1 parent a4a228e commit 45f482f

20 files changed: +108 -86 lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@

 ### Breaking Changes

+- Removed `numpy` dependency. All NaN values returned by the SDK have been changed from `numpy.nan` to `math.nan`.
+
 ### Bugs Fixed

 ### Other Changes

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/math.py

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+from typing import List
+
+
+def list_sum(lst: List[float]) -> float:
+    return sum(lst)
+
+
+def list_mean(lst: List[float]) -> float:
+    return list_sum(lst) / len(lst)
+
+
+def list_mean_nan_safe(lst: List[float]) -> float:
+    return list_mean([l for l in lst if not math.isnan(l)])
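
The three helpers are thin wrappers over the standard library and stand in for numpy.sum, numpy.mean, and numpy.nanmean at the call sites changed below. A minimal usage sketch (assumes the azure-ai-evaluation package containing this change is installed; the input values are illustrative):

import math
from azure.ai.evaluation._common.math import list_mean, list_mean_nan_safe

# list_mean_nan_safe drops NaN entries before averaging, mirroring numpy.nanmean
print(list_mean_nan_safe([4.0, math.nan, 5.0]))  # 4.5
print(list_mean([4.0, 5.0, 6.0]))                # 5.0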

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 15 additions & 15 deletions
@@ -5,12 +5,12 @@
 import importlib.metadata
 import re
 import time
+import math
 from ast import literal_eval
 from typing import Dict, List
 from urllib.parse import urlparse

 import jwt
-import numpy as np

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
@@ -229,21 +229,21 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
         parsed_response = literal_eval(response)
         result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
         result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to np.nan
+            # if present else set them to math.nan
             result[metric_name + "_manipulated_content"] = (
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
             result[metric_name + "_intrusion"] = (
-                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
             result[metric_name + "_information_gathering"] = (
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
     return _parse_content_harm_response(batch_response, metric_name)
@@ -265,7 +265,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS

-    result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}
+    result = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}

     response = batch_response[0]
     if metric_name not in response:
@@ -285,9 +285,9 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if "label" in harm_response:
             metric_value = harm_response["label"]
         elif "valid" in harm_response:
-            metric_value = 0 if harm_response["valid"] else np.nan
+            metric_value = 0 if harm_response["valid"] else math.nan
         else:
-            metric_value = np.nan
+            metric_value = math.nan

         # get reason
         if "reasoning" in harm_response:
@@ -301,21 +301,21 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value = np.nan
+        metric_value = math.nan
         reason = ""

     harm_score = metric_value
-    if not np.isnan(metric_value):
-        # int(np.nan) causes a value error, and np.nan is already handled
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
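
The comment about the null value being "ignored by aggregations rather than treated as 0" holds for math.nan exactly as it did for numpy.nan, since both are ordinary float NaN values. A quick sketch (assumes pandas is available; the scores are illustrative):

import math
import pandas as pd

scores = pd.Series([3, math.nan, 5])
print(scores.mean())               # 4.0 -- the NaN entry is skipped, not counted as 0
print(math.isnan(scores.iloc[1]))  # True -- math.isnan recognizes any float NaN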

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 4 additions & 4 deletions
@@ -2,11 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import threading
 from typing import List, Union

 import nltk
-import numpy as np

 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
@@ -30,12 +30,12 @@ def get_harm_severity_level(harm_score: int) -> str:
         constants.HarmSeverityLevel.Medium: [4, 5],
         constants.HarmSeverityLevel.High: [6, 7],
     }
-    if harm_score == np.nan or harm_score is None:
-        return np.nan
+    if math.isnan(harm_score) or harm_score is None:
+        return math.nan
     for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
-    return np.nan
+    return math.nan


 def ensure_nltk_data_downloaded():
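
NaN never compares equal to anything, including itself, so an equality test like `harm_score == np.nan` cannot detect a NaN value; `math.isnan` is the reliable check. A small illustration:

import math

nan = float("nan")
print(nan == nan)       # False -- equality comparisons never match NaN
print(math.isnan(nan))  # True  -- isnan is the dependable test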

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

Lines changed: 2 additions & 2 deletions
@@ -3,11 +3,11 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union

-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,7 +53,7 @@ def run(
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)", np.nan, inplace=True)
+        result_df.replace("(Failed)", math.nan, inplace=True)
        return result_df

     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
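
pandas treats math.nan like any other float NaN, so replacing the "(Failed)" sentinel with it marks those cells as missing. A minimal sketch (the column name is illustrative):

import math
import pandas as pd

df = pd.DataFrame({"outputs.score": ["(Failed)", 4, 5]})
df.replace("(Failed)", math.nan, inplace=True)
print(df["outputs.score"].isna().tolist())  # [True, False, False]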

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 3 additions & 3 deletions
@@ -6,12 +6,12 @@
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type

-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.math import list_sum

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
@@ -73,7 +73,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )
@@ -107,7 +107,7 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates
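
`list_sum` is simply `sum()`, and summing a boolean pandas Series counts its True entries, so the defect-rate arithmetic is unchanged. A hedged sketch of the computation (the threshold and scores are illustrative, not necessarily the SDK defaults):

import pandas as pd

scores = pd.to_numeric(pd.Series([1, 5, 7, None]), errors="coerce")
threshold = 4
# count() excludes NaN, so failed rows do not dilute the rate
defect_rate = round(sum(scores >= threshold) / scores.count(), 2)
print(defect_rate)  # 0.67 -- two of the three valid scores meet the threshold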

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 2 additions & 2 deletions
@@ -7,10 +7,10 @@

 from abc import ABC

-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop

 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._common.math import list_mean


 # TODO exception target pass down?
@@ -241,7 +241,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict:
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = np.mean(values)
+                aggregated[metric] = list_mean(values)
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn

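
The aggregation only averages metrics whose per-turn values are all numeric; string fields such as reasons are left untouched. A small sketch of that loop in isolation (the metric names and values are illustrative):

evaluation_per_turn = {
    "gpt_coherence": [4, 5, 3],
    "gpt_coherence_reason": ["ok", "ok", "ok"],
}
aggregated = {}
for metric, values in evaluation_per_turn.items():
    if all(isinstance(value, (int, float)) for value in values):
        aggregated[metric] = sum(values) / len(values)  # what list_mean computes
print(aggregated)  # {'gpt_coherence': 4.0}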

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 3 additions & 6 deletions
@@ -2,15 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import re
 from typing import Dict

-from typing_extensions import override
-
-
-import numpy as np
-
 from promptflow.core import AsyncPrompty
+from typing_extensions import override

 from ..._common.utils import construct_prompty_model_config

@@ -71,7 +68,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict:
        """
        llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)

-        score = np.nan
+        score = math.nan
        if llm_output:
            match = re.search(r"\d", llm_output)
            if match:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py

Lines changed: 6 additions & 5 deletions
@@ -2,12 +2,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
+import math
 from concurrent.futures import as_completed
 from typing import Dict, List

-import numpy as np
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+from azure.ai.evaluation._common.math import list_mean_nan_safe
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 try:
@@ -198,7 +199,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]):
            score_key = f"{metric}_score"
            reason_key = f"{metric}_reason"

-            aggregated_score = np.nanmean(scores[score_key])
+            aggregated_score = list_mean_nan_safe(scores[score_key])
            aggregated[metric] = self._get_harm_severity_level(aggregated_score)
            aggregated[score_key] = aggregated_score

@@ -291,11 +292,11 @@ def _get_harm_severity_level(self, harm_score: float) -> str:
            "High": [6, 7],
        }

-        if harm_score == np.nan or harm_score is None:
-            return np.nan
+        if math.isnan(harm_score) or harm_score is None:
+            return math.nan

        for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
            if harm_score_range[0] <= harm_score <= harm_score_range[1]:
                return harm_level

-        return np.nan
+        return math.nan

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

Lines changed: 5 additions & 5 deletions
@@ -4,14 +4,14 @@

 import json
 import logging
+import math
 import os
 import re

-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

-
+from ..._common.math import list_mean_nan_safe
 from ..._common.utils import construct_prompty_model_config

 logger = logging.getLogger(__name__)
@@ -69,7 +69,7 @@ async def __call__(self, *, conversation, **kwargs):
                llm_output = await self._flow(
                    query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
                )
-                score = np.nan
+                score = math.nan
                if llm_output:
                    parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
                    if len(parsed_score_response) > 0:
@@ -82,10 +82,10 @@ async def __call__(self, *, conversation, **kwargs):
                    "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
                )

-                per_turn_scores.append(np.nan)
+                per_turn_scores.append(math.nan)

        return {
-            "gpt_retrieval": np.nanmean(per_turn_scores),
+            "gpt_retrieval": list_mean_nan_safe(per_turn_scores),
            "evaluation_per_turn": {
                "gpt_retrieval": {
                    "score": per_turn_scores,

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py

Lines changed: 2 additions & 2 deletions
@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import os
 import re

-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

@@ -57,7 +57,7 @@ async def __call__(self, *, query: str, response: str, ground_truth: str, **kwar
            query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
        )

-        score = np.nan
+        score = math.nan
        if llm_output:
            match = re.search(r"\d", llm_output)
            if match:
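
The prompty-based evaluators share this fallback pattern: start from math.nan and only overwrite it when a digit can be parsed out of the LLM output. A hedged sketch (the sample output string and the float conversion after the match are illustrative; only the preceding lines appear in the diff):

import math
import re

llm_output = "4 - the response mostly matches the ground truth"
score = math.nan
if llm_output:
    match = re.search(r"\d", llm_output)
    if match:
        score = float(match.group())  # assumed continuation of the snippet above
print(score)  # 4.0; remains nan when no digit is found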

sdk/evaluation/azure-ai-evaluation/setup.py

Lines changed: 0 additions & 2 deletions
@@ -67,8 +67,6 @@
     install_requires=[
         "promptflow-devkit>=1.15.0",
         "promptflow-core>=1.15.0",
-        "numpy>=1.23.2; python_version<'3.12'",
-        "numpy>=1.26.4; python_version>='3.12'",
         "pyjwt>=2.8.0",
         "azure-identity>=1.12.0",
         "azure-core>=1.30.2",

sdk/evaluation/azure-ai-evaluation/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py

Lines changed: 9 additions & 2 deletions
@@ -1,6 +1,13 @@
 from typing import List

-import numpy as np
+
+def median(lst: List[str]) -> float:
+    lst.sort()
+    length = len(lst)
+    if length % 2 == 1:
+        return lst[length // 2]
+    else:
+        return (lst[length // 2 - 1] + lst[length // 2]) / 2


 class AnswerLength:
@@ -12,5 +19,5 @@ def __call__(self, response: str, **kwargs):
         return {"length": len(response)} if self.return_json else len(response)

     def __aggregate__(self, line_results: List[str]) -> dict:
-        median_value = np.median([v.length for v in line_results]) if self.return_json else np.median(line_results)
+        median_value = median([v.length for v in line_results]) if self.return_json else median(line_results)
         return {"median": median_value} if self.aggregate_return_json else median_value

0 commit comments
