From 60fb8486be5fd6cbbfaa2dc3e43bb1edb9b33286 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 16 Dec 2025 14:34:23 +0200 Subject: [PATCH 1/7] Ensure tool calls in tool call accuracy evaluator include the 'arguments' field + test cases --- .../_tool_call_accuracy.py | 13 +++ .../test_tool_call_accuracy_evaluator.py | 83 ++++++++++++++++++- 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 80c470f02eba..e1cd3651959e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -172,6 +172,19 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if not isinstance(tool_calls, list): tool_calls = [tool_calls] + + # Validate that all tool calls have the "arguments" key + for i, tool_call in enumerate(tool_calls): + if isinstance(tool_call, dict): + if "arguments" not in tool_call: + raise EvaluationException( + message=f"Tool call at index {i} missing 'arguments' key: {tool_call}", + internal_message="Tool call validation failed - missing arguments field", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) + if not isinstance(tool_definitions, list): tool_definitions = [tool_definitions] if tool_definitions else [] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 7b82c1beb8c3..43af22dae577 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -688,3 +688,84 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config): assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" + + def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" 
+ # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): + """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" + # Tool calls parameter with missing 'arguments' field + tool_calls = [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD From 1802d14cd9e6864d3f45006480db876d6a21eabb Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 16 Dec 2025 14:51:12 +0200 Subject: [PATCH 2/7] Ensure arguments field in Tool Input Accuracy Evaluator + test case --- .../_evaluators/_common/_base_eval.py | 7 +++- .../_tool_call_accuracy.py | 21 +++++++--- .../_tool_input_accuracy.py | 11 ++++- .../test_tool_call_accuracy_evaluator.py | 8 ++-- .../test_tool_input_accuracy_evaluator.py | 42 +++++++++++++++++++ 5 files changed, 76 insertions(+), 13 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 36d928e9e072..0f6e89e800e4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -486,7 +486,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - def _parse_tools_from_response(self, response): + def _parse_tools_from_response(self, response, ensure_arguments=False): """Parse the response to extract tool calls and results. :param response: The response to parse. 
:type response: Union[str, List[dict]] @@ -505,6 +505,11 @@ def _parse_tools_from_response(self, response): if message.get("role") == "assistant" and isinstance(message.get("content"), list): for content_item in message.get("content"): if isinstance(content_item, dict) and content_item.get("type") == "tool_call": + if ensure_arguments and "arguments" not in content_item: + raise EvaluationException( + message=f"Tool call missing 'arguments' field: {content_item}", + category=ErrorCategory.MISSING_FIELD, + ) tool_calls.append(copy.deepcopy(content_item)) # Extract tool results from tool messages diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index e1cd3651959e..f1caec5e9001 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -162,10 +162,20 @@ def _convert_kwargs_to_eval_input(self, **kwargs): query = kwargs.get("query") response = kwargs.get("response") # TODO : Support classes that represents tool calls, messages etc once client side definitions are available + + # Initially try to extract tool calls from the response whether or not tool_calls parameter is provided if response: - parsed_tool_calls = self._parse_tools_from_response(response) - if parsed_tool_calls: - tool_calls = parsed_tool_calls + try: + parsed_tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + if parsed_tool_calls: + tool_calls = parsed_tool_calls + except EvaluationException as e: + raise EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e if not tool_calls: return {"error_message": self._NO_TOOL_CALLS_MESSAGE} @@ -174,12 +184,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = [tool_calls] # Validate that all tool calls have the "arguments" key - for i, tool_call in enumerate(tool_calls): + for tool_call in tool_calls: if isinstance(tool_call, dict): if "arguments" not in tool_call: raise EvaluationException( - message=f"Tool call at index {i} missing 'arguments' key: {tool_call}", - internal_message="Tool call validation failed - missing arguments field", + message=f"Tool call missing 'arguments' field: {tool_call}", category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.USER_ERROR, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 006c877e0db2..f73337d949dc 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -110,7 +110,16 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if not response: return {"error_message": "Response parameter is required to extract tool calls."} - tool_calls = self._parse_tools_from_response(response) + try: + tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + except EvaluationException as e: + raise 
EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e + if not tool_calls: return {"error_message": self._NO_TOOL_CALLS_MESSAGE} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 43af22dae577..c26ab8840e5d 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException +from azure.ai.evaluation._exceptions import EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -729,8 +729,7 @@ def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, response=response, tool_definitions=tool_definitions) - assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) - assert exc_info.value.category == ErrorCategory.MISSING_FIELD + assert "Tool call missing 'arguments' field" in str(exc_info.value) def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" @@ -767,5 +766,4 @@ def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_c with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) - assert exc_info.value.category == ErrorCategory.MISSING_FIELD + assert "Tool call missing 'arguments' field" in str(exc_info.value) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c8da3c223b9a..4e8b250fc317 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -652,3 +652,45 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config): assert result is not None assert result[key] == 1 assert result[f"{key}_result"] == "pass" + + def test_evaluate_missing_arguments_field(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" 
+ # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'get_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) From cc55e2d43402d461e703deabba8b10317d36e8ea Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 16 Dec 2025 16:11:33 +0200 Subject: [PATCH 3/7] Throw exception in case of insufficient tool definitions provided --- .../_tool_call_accuracy.py | 17 ++++--- .../_tool_input_accuracy.py | 17 ++++--- .../_tool_selection/_tool_selection.py | 17 ++++--- .../test_tool_call_accuracy_evaluator.py | 46 +++++++++---------- .../test_tool_input_accuracy_evaluator.py | 30 ++++++------ .../test_tool_selection_evaluator.py | 24 ++++++---- 6 files changed, 81 insertions(+), 70 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index f1caec5e9001..fa5afa3669a3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -201,15 +201,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) return { "query": query, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index f73337d949dc..b2006ad4b9b5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -134,15 +134,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls_typed, 
tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Get agent response with tool calls and results using _get_agent_response agent_response_with_tools = _get_agent_response(response, include_tool_messages=True) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 616be11b9ee8..0a3cf155889d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -147,15 +147,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Extract only tool names from tool calls, removing parameters and results tool_names = self._extract_tool_names_from_calls(tool_calls) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index c26ab8840e5d..b6f52fceb6a4 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -347,17 +347,16 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert 
result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Should throw an exception because buy_jacket definition has invalid type (not "function") + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + # because "another_built_in" is not a valid type (not "function") + assert "buy_jacket" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() - def test_evaluate_tools_all_not_applicable(self, mock_model_config): + def test_evaluate_tools_missing_tool_definitions_throws_exception(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -387,15 +386,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "fetch_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -605,12 +602,13 @@ def test_evaluate_open_api(self, mock_model_config): }, ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" + # OpenAPI is not a simple built-in tool - it requires a full definition with functions + # So calling without tool_definitions should throw an exception + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "openapi" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_open_api_with_tool_definition(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index 4e8b250fc317..c5ef4a6c371e 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -431,7 +431,7 @@ def test_evaluate_no_tool_calls(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert _ToolInputAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{key}_reason"] - def test_evaluate_no_tool_definitions(self, 
mock_model_config): + def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when no tool definitions are provided.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -452,15 +452,14 @@ def test_evaluate_no_tool_definitions(self, mock_model_config): ] tool_definitions = [] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() - def test_evaluate_missing_tool_definitions(self, mock_model_config): + def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when tool definitions are missing for some tool calls.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -481,13 +480,12 @@ def test_evaluate_missing_tool_definitions(self, mock_model_config): ] tool_definitions = [{"name": "different_function", "type": "function", "description": "A different function"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_invalid_result_value(self, mock_model_config): """Test that invalid result values raise an exception.""" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 8390e30c3e4c..0a5be6dc8951 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -242,21 +242,27 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) assert result[f"{key}_result"] == "pass" assert f"{key}_reason" in result - def test_evaluate_tool_selection_not_applicable_no_tool_definitions(self, mock_model_config): + def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect) query = "What's the weather like today?" 
- tool_calls = [] + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_weather", + "name": "get_weather", + "arguments": {"location": "current"}, + } + ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) From 0e9a6522fe905a767a1193b6564679122c4bcb26 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 14:10:22 +0200 Subject: [PATCH 4/7] Raise not applicable exception in case of no tool calls/no(t enough) tool definitions --- .../_evaluators/_common/_base_prompty_eval.py | 4 +-- .../_tool_call_accuracy.py | 16 ++++++---- .../_tool_input_accuracy.py | 24 ++++++++------ .../_tool_selection/_tool_selection.py | 12 ++++--- .../test_tool_call_accuracy_evaluator.py | 21 ++++++------ .../test_tool_input_accuracy_evaluator.py | 32 ++++++++++--------- .../test_tool_selection_evaluator.py | 17 +++++----- 7 files changed, 70 insertions(+), 56 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index de49450b4a81..8157ae956420 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -294,14 +294,14 @@ def _extract_needed_tool_definitions( raise EvaluationException( message=f"Tool definition for {tool_name} not found", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=error_target, ) else: raise EvaluationException( message=f"Tool call missing name: {tool_call}", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.MISSING_FIELD, target=error_target, ) else: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index fa5afa3669a3..3351f67af29e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -178,7 +178,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): ) from e if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -209,7 +214,7 @@ def 
_convert_kwargs_to_eval_input(self, **kwargs): if len(needed_tool_definitions) == 0: raise EvaluationException( message=self._NO_TOOL_DEFINITIONS_MESSAGE, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.USER_ERROR, ) @@ -243,7 +248,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", internal_message="Invalid score value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -273,7 +278,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message="Tool call accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) @@ -287,9 +292,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - return self._not_applicable_result(eval_input.get("error_message"), self.threshold) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index b2006ad4b9b5..07c65e2a07af 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -108,7 +108,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): # Extract tool calls from response if not response: - return {"error_message": "Response parameter is required to extract tool calls."} + raise EvaluationException( + message="Response is required for tool input accuracy evaluation.", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) try: tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) @@ -121,7 +126,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): ) from e if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -142,7 +152,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if len(needed_tool_definitions) == 0: raise EvaluationException( message=self._NO_TOOL_DEFINITIONS_MESSAGE, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, blame=ErrorBlame.USER_ERROR, ) @@ -181,7 +191,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise 
EvaluationException( message=f"Invalid result value: {result}. Expected 0 or 1.", internal_message="Invalid result value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -214,7 +224,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise EvaluationException( message="Tool input accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) @@ -228,10 +238,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - error_message = eval_input.get("error_message", "Unknown error") - return self._not_applicable_result(error_message, 1) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 0a3cf155889d..1562c0faa0f8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -136,7 +136,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = parsed_tool_calls if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -155,7 +160,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if len(needed_tool_definitions) == 0: raise EvaluationException( message=self._NO_TOOL_DEFINITIONS_MESSAGE, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=ErrorTarget.TOOL_SELECTION_EVALUATOR, blame=ErrorBlame.USER_ERROR, ) @@ -243,9 +248,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. 
eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - return self._not_applicable_result(eval_input.get("error_message"), 1) - result = await self._do_eval(eval_input) return result diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index b6f52fceb6a4..72d73d400418 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -393,6 +393,7 @@ def test_evaluate_tools_missing_tool_definitions_throws_exception(self, mock_mod # The error should mention the specific tool that's missing assert "fetch_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -417,15 +418,14 @@ def test_evaluate_tools_no_tools(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result[f"{key}_details"] == {} + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_bing_custom_search(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -609,6 +609,7 @@ def test_evaluate_open_api(self, mock_model_config): evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) assert "openapi" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_open_api_with_tool_definition(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c5ef4a6c371e..77ef9186193e 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -6,7 +6,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_input_accuracy import 
_ToolInputAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -423,13 +423,13 @@ def test_evaluate_no_tool_calls(self, mock_model_config): response = [{"role": "assistant", "content": "I can help you with that."}] tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when no tool definitions are provided.""" @@ -458,6 +458,7 @@ def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): # The error message should mention the specific tool that's missing assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when tool definitions are missing for some tool calls.""" @@ -486,6 +487,7 @@ def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_con # The error should mention the specific tool that's missing assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_invalid_result_value(self, mock_model_config): """Test that invalid result values raise an exception.""" @@ -531,13 +533,13 @@ def test_evaluate_no_response(self, mock_model_config): query = "Get weather" tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=None, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert "Response parameter is required" in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=None, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "response is required" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.MISSING_FIELD def test_parameter_extraction_accuracy_calculation(self, mock_model_config): """Test the parameter extraction accuracy calculation.""" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 
0a5be6dc8951..71e469335aae 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # Mock function for Tool Selection evaluator flow side effect @@ -234,13 +234,13 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) } ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) @@ -263,6 +263,7 @@ def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock # The error message should mention the specific tool that's missing assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) From 446bbbbaaf998af0a7ee42adf2856c077d6353ca Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 14:54:27 +0200 Subject: [PATCH 5/7] Update failed output category to failed execution --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 2 +- .../_evaluators/_tool_input_accuracy/_tool_input_accuracy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3351f67af29e..b65775182eaa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -278,7 +278,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message="Tool call accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.FAILED_EXECUTION, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 07c65e2a07af..f9b4d897443f 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -224,7 +224,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise EvaluationException( message="Tool input accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.FAILED_EXECUTION, target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) From 8d1e93bdfe8702f6f9a6c136200c84114fd47386 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 16:11:07 +0200 Subject: [PATCH 6/7] Add error category verification in tests --- .../tests/unittests/test_tool_call_accuracy_evaluator.py | 2 ++ .../tests/unittests/test_tool_input_accuracy_evaluator.py | 1 + 2 files changed, 3 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 72d73d400418..87045df2c08a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -729,6 +729,7 @@ def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): evaluator(query=query, response=response, tool_definitions=tool_definitions) assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" @@ -766,3 +767,4 @@ def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_c evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index 77ef9186193e..574acf6f9b14 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -694,3 +694,4 @@ def test_evaluate_missing_arguments_field(self, mock_model_config): evaluator(query=query, response=response, tool_definitions=tool_definitions) assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD From 7b6e70e9d86194331a673fcaa5e63532bd1cf563 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 16:13:34 +0200 Subject: [PATCH 7/7] Update test comments --- .../tests/unittests/test_tool_call_accuracy_evaluator.py | 3 +-- .../tests/unittests/test_tool_input_accuracy_evaluator.py | 3 +-- .../tests/unittests/test_tool_selection_evaluator.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 87045df2c08a..7c0a158052fe 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -422,8 +422,7 @@ def test_evaluate_tools_no_tools(self, mock_model_config): # Expect an exception to be raised with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - # The error should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index 574acf6f9b14..2f0670cec7eb 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -426,8 +426,7 @@ def test_evaluate_no_tool_calls(self, mock_model_config): # Expect an exception to be raised with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, response=response, tool_definitions=tool_definitions) - - # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 71e469335aae..e3ba055df1e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -237,8 +237,7 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) # Expect an exception to be raised with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE
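
The net effect of this series is that the tool evaluators now raise EvaluationException instead of returning a "not applicable" result: MISSING_FIELD when a tool call lacks the 'arguments' field, and NOT_APPLICABLE when no tool calls or no matching tool definitions are found. A minimal caller-side sketch of handling the new behavior, mirroring the imports and assertions in the unit tests above; the helper name run_tool_call_accuracy, the model_config value, and the choice to treat NOT_APPLICABLE as a skipped sample are illustrative assumptions, not part of the patch:

    from azure.ai.evaluation import ToolCallAccuracyEvaluator
    from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException

    def run_tool_call_accuracy(model_config, query, tool_calls, tool_definitions):
        # model_config is assumed to be a valid model configuration for the evaluator.
        evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
        try:
            return evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
        except EvaluationException as exc:
            if exc.category is ErrorCategory.NOT_APPLICABLE:
                # No tool calls, or no matching tool definitions: nothing to score for this sample.
                return None
            # MISSING_FIELD (e.g. a tool call without 'arguments') and other categories indicate
            # malformed input and are propagated to the caller.
            raise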