
Commit 45f482f

Remove numpy Dependency from Evaluation SDK (#37688)
* Remove numpy Dependency
* changelog entry
* remove numpy from shared_requirements
* use list comprehension instead
* fix math not defined issue
* fix math import and remove numpy in tests
* address some comments, fix tests
1 parent a4a228e commit 45f482f

20 files changed: +108 -86 lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@

 ### Breaking Changes

+- Removed `numpy` dependency. All NaN values returned by the SDK have been changed from `numpy.nan` to `math.nan`.
+
 ### Bugs Fixed

 ### Other Changes

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/math.py

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+from typing import List
+
+
+def list_sum(lst: List[float]) -> float:
+    return sum(lst)
+
+
+def list_mean(lst: List[float]) -> float:
+    return list_sum(lst) / len(lst)
+
+
+def list_mean_nan_safe(lst: List[float]) -> float:
+    return list_mean([l for l in lst if not math.isnan(l)])
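
The three helpers are thin wrappers over the standard library and stand in for numpy.sum, numpy.mean, and numpy.nanmean at the call sites changed below. A minimal usage sketch (assumes the azure-ai-evaluation package containing this change is installed; the input values are illustrative):

import math
from azure.ai.evaluation._common.math import list_mean, list_mean_nan_safe

# list_mean_nan_safe drops NaN entries before averaging, mirroring numpy.nanmean
print(list_mean_nan_safe([4.0, math.nan, 5.0]))  # 4.5
print(list_mean([4.0, 5.0, 6.0]))                # 5.0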

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 15 additions & 15 deletions
@@ -5,12 +5,12 @@
 import importlib.metadata
 import re
 import time
+import math
 from ast import literal_eval
 from typing import Dict, List
 from urllib.parse import urlparse

 import jwt
-import numpy as np

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
@@ -229,21 +229,21 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
         parsed_response = literal_eval(response)
         result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
         result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to np.nan
+            # if present else set them to math.nan
             result[metric_name + "_manipulated_content"] = (
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
             result[metric_name + "_intrusion"] = (
-                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
             result[metric_name + "_information_gathering"] = (
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
     return _parse_content_harm_response(batch_response, metric_name)
@@ -265,7 +265,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS

-    result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}
+    result = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}

     response = batch_response[0]
     if metric_name not in response:
@@ -285,9 +285,9 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if "label" in harm_response:
             metric_value = harm_response["label"]
         elif "valid" in harm_response:
-            metric_value = 0 if harm_response["valid"] else np.nan
+            metric_value = 0 if harm_response["valid"] else math.nan
         else:
-            metric_value = np.nan
+            metric_value = math.nan

         # get reason
         if "reasoning" in harm_response:
@@ -301,21 +301,21 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value = np.nan
+        metric_value = math.nan
         reason = ""

     harm_score = metric_value
-    if not np.isnan(metric_value):
-        # int(np.nan) causes a value error, and np.nan is already handled
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
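
The comment about the null value being "ignored by aggregations rather than treated as 0" holds for math.nan exactly as it did for numpy.nan, since both are ordinary float NaN values. A quick sketch (assumes pandas is available; the scores are illustrative):

import math
import pandas as pd

scores = pd.Series([3, math.nan, 5])
print(scores.mean())               # 4.0 -- the NaN entry is skipped, not counted as 0
print(math.isnan(scores.iloc[1]))  # True -- math.isnan recognizes any float NaN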

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 4 additions & 4 deletions
@@ -2,11 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import threading
 from typing import List, Union

 import nltk
-import numpy as np

 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
@@ -30,12 +30,12 @@ def get_harm_severity_level(harm_score: int) -> str:
         constants.HarmSeverityLevel.Medium: [4, 5],
         constants.HarmSeverityLevel.High: [6, 7],
     }
-    if harm_score == np.nan or harm_score is None:
-        return np.nan
+    if math.isnan(harm_score) or harm_score is None:
+        return math.nan
     for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
-    return np.nan
+    return math.nan


 def ensure_nltk_data_downloaded():
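
NaN never compares equal to anything, including itself, so an equality test like `harm_score == np.nan` cannot detect a NaN value; `math.isnan` is the reliable check. A small illustration:

import math

nan = float("nan")
print(nan == nan)       # False -- equality comparisons never match NaN
print(math.isnan(nan))  # True  -- isnan is the dependable test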

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

Lines changed: 2 additions & 2 deletions
@@ -3,11 +3,11 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union

-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,7 +53,7 @@ def run(
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)", np.nan, inplace=True)
+        result_df.replace("(Failed)", math.nan, inplace=True)
        return result_df

     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
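
pandas treats math.nan like any other float NaN, so replacing the "(Failed)" sentinel with it marks those cells as missing. A minimal sketch (the column name is illustrative):

import math
import pandas as pd

df = pd.DataFrame({"outputs.score": ["(Failed)", 4, 5]})
df.replace("(Failed)", math.nan, inplace=True)
print(df["outputs.score"].isna().tolist())  # [True, False, False]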

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 3 additions & 3 deletions
@@ -6,12 +6,12 @@
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type

-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.math import list_sum

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
@@ -73,7 +73,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )
@@ -107,7 +107,7 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates
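
`list_sum` is simply `sum()`, and summing a boolean pandas Series counts its True entries, so the defect-rate arithmetic is unchanged. A hedged sketch of the computation (the threshold and scores are illustrative, not necessarily the SDK defaults):

import pandas as pd

scores = pd.to_numeric(pd.Series([1, 5, 7, None]), errors="coerce")
threshold = 4
# count() excludes NaN, so failed rows do not dilute the rate
defect_rate = round(sum(scores >= threshold) / scores.count(), 2)
print(defect_rate)  # 0.67 -- two of the three valid scores meet the threshold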

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 2 additions & 2 deletions
@@ -7,10 +7,10 @@

 from abc import ABC

-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop

 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._common.math import list_mean


 # TODO exception target pass down?
@@ -241,7 +241,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict:
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = np.mean(values)
+                aggregated[metric] = list_mean(values)
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn

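
The aggregation only averages metrics whose per-turn values are all numeric; string fields such as reasons are left untouched. A small sketch of that loop in isolation (the metric names and values are illustrative):

evaluation_per_turn = {
    "gpt_coherence": [4, 5, 3],
    "gpt_coherence_reason": ["ok", "ok", "ok"],
}
aggregated = {}
for metric, values in evaluation_per_turn.items():
    if all(isinstance(value, (int, float)) for value in values):
        aggregated[metric] = sum(values) / len(values)  # what list_mean computes
print(aggregated)  # {'gpt_coherence': 4.0}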

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 3 additions & 6 deletions
@@ -2,15 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import re
 from typing import Dict

-from typing_extensions import override
-
-
-import numpy as np
-
 from promptflow.core import AsyncPrompty
+from typing_extensions import override

 from ..._common.utils import construct_prompty_model_config

@@ -71,7 +68,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict:
        """
        llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)

-        score = np.nan
+        score = math.nan
        if llm_output:
            match = re.search(r"\d", llm_output)
            if match:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py

Lines changed: 6 additions & 5 deletions
@@ -2,12 +2,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
+import math
 from concurrent.futures import as_completed
 from typing import Dict, List

-import numpy as np
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+from azure.ai.evaluation._common.math import list_mean_nan_safe
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 try:
@@ -198,7 +199,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]):
            score_key = f"{metric}_score"
            reason_key = f"{metric}_reason"

-            aggregated_score = np.nanmean(scores[score_key])
+            aggregated_score = list_mean_nan_safe(scores[score_key])
            aggregated[metric] = self._get_harm_severity_level(aggregated_score)
            aggregated[score_key] = aggregated_score

@@ -291,11 +292,11 @@ def _get_harm_severity_level(self, harm_score: float) -> str:
            "High": [6, 7],
        }

-        if harm_score == np.nan or harm_score is None:
-            return np.nan
+        if math.isnan(harm_score) or harm_score is None:
+            return math.nan

        for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
            if harm_score_range[0] <= harm_score <= harm_score_range[1]:
                return harm_level

-        return np.nan
+        return math.nan

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

Lines changed: 5 additions & 5 deletions
@@ -4,14 +4,14 @@

 import json
 import logging
+import math
 import os
 import re

-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

-
+from ..._common.math import list_mean_nan_safe
 from ..._common.utils import construct_prompty_model_config

 logger = logging.getLogger(__name__)
@@ -69,7 +69,7 @@ async def __call__(self, *, conversation, **kwargs):
                llm_output = await self._flow(
                    query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
                )
-                score = np.nan
+                score = math.nan
                if llm_output:
                    parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
                    if len(parsed_score_response) > 0:
@@ -82,10 +82,10 @@ async def __call__(self, *, conversation, **kwargs):
                    "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
                )

-                per_turn_scores.append(np.nan)
+                per_turn_scores.append(math.nan)

        return {
-            "gpt_retrieval": np.nanmean(per_turn_scores),
+            "gpt_retrieval": list_mean_nan_safe(per_turn_scores),
            "evaluation_per_turn": {
                "gpt_retrieval": {
                    "score": per_turn_scores,

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py

Lines changed: 2 additions & 2 deletions
@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import os
 import re

-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

@@ -57,7 +57,7 @@ async def __call__(self, *, query: str, response: str, ground_truth: str, **kwar
            query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
        )

-        score = np.nan
+        score = math.nan
        if llm_output:
            match = re.search(r"\d", llm_output)
            if match:
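
The prompty-based evaluators share this fallback pattern: start from math.nan and only overwrite it when a digit can be parsed out of the LLM output. A hedged sketch (the sample output string and the float conversion after the match are illustrative; only the preceding lines appear in the diff):

import math
import re

llm_output = "4 - the response mostly matches the ground truth"
score = math.nan
if llm_output:
    match = re.search(r"\d", llm_output)
    if match:
        score = float(match.group())  # assumed continuation of the snippet above
print(score)  # 4.0; remains nan when no digit is found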

sdk/evaluation/azure-ai-evaluation/setup.py

Lines changed: 0 additions & 2 deletions
@@ -67,8 +67,6 @@
     install_requires=[
         "promptflow-devkit>=1.15.0",
         "promptflow-core>=1.15.0",
-        "numpy>=1.23.2; python_version<'3.12'",
-        "numpy>=1.26.4; python_version>='3.12'",
         "pyjwt>=2.8.0",
         "azure-identity>=1.12.0",
         "azure-core>=1.30.2",

sdk/evaluation/azure-ai-evaluation/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py

Lines changed: 9 additions & 2 deletions
@@ -1,6 +1,13 @@
 from typing import List

-import numpy as np
+
+def median(lst: List[str]) -> float:
+    lst.sort()
+    length = len(lst)
+    if length % 2 == 1:
+        return lst[length // 2]
+    else:
+        return (lst[length // 2 - 1] + lst[length // 2]) / 2


 class AnswerLength:
@@ -12,5 +19,5 @@ def __call__(self, response: str, **kwargs):
         return {"length": len(response)} if self.return_json else len(response)

     def __aggregate__(self, line_results: List[str]) -> dict:
-        median_value = np.median([v.length for v in line_results]) if self.return_json else np.median(line_results)
+        median_value = median([v.length for v in line_results]) if self.return_json else median(line_results)
         return {"median": median_value} if self.aggregate_return_json else median_value

0 commit comments
