From 60fb8486be5fd6cbbfaa2dc3e43bb1edb9b33286 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 16 Dec 2025 14:34:23 +0200 Subject: [PATCH 1/7] Ensure tool calls in tool call accuracy evaluator include the 'arguments' field + test cases --- .../_tool_call_accuracy.py | 13 +++ .../test_tool_call_accuracy_evaluator.py | 83 ++++++++++++++++++- 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 80c470f02eba..e1cd3651959e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -172,6 +172,19 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if not isinstance(tool_calls, list): tool_calls = [tool_calls] + + # Validate that all tool calls have the "arguments" key + for i, tool_call in enumerate(tool_calls): + if isinstance(tool_call, dict): + if "arguments" not in tool_call: + raise EvaluationException( + message=f"Tool call at index {i} missing 'arguments' key: {tool_call}", + internal_message="Tool call validation failed - missing arguments field", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) + if not isinstance(tool_definitions, list): tool_definitions = [tool_definitions] if tool_definitions else [] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 7b82c1beb8c3..43af22dae577 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -688,3 +688,84 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config): assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" + + def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" 
+ # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): + """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" + # Tool calls parameter with missing 'arguments' field + tool_calls = [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD From 1802d14cd9e6864d3f45006480db876d6a21eabb Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 16 Dec 2025 14:51:12 +0200 Subject: [PATCH 2/7] Ensure arguments field in Tool Input Accuracy Evaluator + test case --- .../_evaluators/_common/_base_eval.py | 7 +++- .../_tool_call_accuracy.py | 21 +++++++--- .../_tool_input_accuracy.py | 11 ++++- .../test_tool_call_accuracy_evaluator.py | 8 ++-- .../test_tool_input_accuracy_evaluator.py | 42 +++++++++++++++++++ 5 files changed, 76 insertions(+), 13 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 36d928e9e072..0f6e89e800e4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -486,7 +486,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - def _parse_tools_from_response(self, response): + def _parse_tools_from_response(self, response, ensure_arguments=False): """Parse the response to extract tool calls and results. :param response: The response to parse. 
:type response: Union[str, List[dict]] @@ -505,6 +505,11 @@ def _parse_tools_from_response(self, response): if message.get("role") == "assistant" and isinstance(message.get("content"), list): for content_item in message.get("content"): if isinstance(content_item, dict) and content_item.get("type") == "tool_call": + if ensure_arguments and "arguments" not in content_item: + raise EvaluationException( + message=f"Tool call missing 'arguments' field: {content_item}", + category=ErrorCategory.MISSING_FIELD, + ) tool_calls.append(copy.deepcopy(content_item)) # Extract tool results from tool messages diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index e1cd3651959e..f1caec5e9001 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -162,10 +162,20 @@ def _convert_kwargs_to_eval_input(self, **kwargs): query = kwargs.get("query") response = kwargs.get("response") # TODO : Support classes that represents tool calls, messages etc once client side definitions are available + + # Initially try to extract tool calls from the response whether or not tool_calls parameter is provided if response: - parsed_tool_calls = self._parse_tools_from_response(response) - if parsed_tool_calls: - tool_calls = parsed_tool_calls + try: + parsed_tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + if parsed_tool_calls: + tool_calls = parsed_tool_calls + except EvaluationException as e: + raise EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e if not tool_calls: return {"error_message": self._NO_TOOL_CALLS_MESSAGE} @@ -174,12 +184,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = [tool_calls] # Validate that all tool calls have the "arguments" key - for i, tool_call in enumerate(tool_calls): + for tool_call in tool_calls: if isinstance(tool_call, dict): if "arguments" not in tool_call: raise EvaluationException( - message=f"Tool call at index {i} missing 'arguments' key: {tool_call}", - internal_message="Tool call validation failed - missing arguments field", + message=f"Tool call missing 'arguments' field: {tool_call}", category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.USER_ERROR, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 006c877e0db2..f73337d949dc 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -110,7 +110,16 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if not response: return {"error_message": "Response parameter is required to extract tool calls."} - tool_calls = self._parse_tools_from_response(response) + try: + tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + except EvaluationException as e: + raise 
EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e + if not tool_calls: return {"error_message": self._NO_TOOL_CALLS_MESSAGE} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 43af22dae577..c26ab8840e5d 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException +from azure.ai.evaluation._exceptions import EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -729,8 +729,7 @@ def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, response=response, tool_definitions=tool_definitions) - assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) - assert exc_info.value.category == ErrorCategory.MISSING_FIELD + assert "Tool call missing 'arguments' field" in str(exc_info.value) def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" @@ -767,5 +766,4 @@ def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_c with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - assert "Tool call at index 0 missing 'arguments' key" in str(exc_info.value) - assert exc_info.value.category == ErrorCategory.MISSING_FIELD + assert "Tool call missing 'arguments' field" in str(exc_info.value) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c8da3c223b9a..4e8b250fc317 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -652,3 +652,45 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config): assert result is not None assert result[key] == 1 assert result[f"{key}_result"] == "pass" + + def test_evaluate_missing_arguments_field(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" 
+ # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'get_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) From cc55e2d43402d461e703deabba8b10317d36e8ea Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 16 Dec 2025 16:11:33 +0200 Subject: [PATCH 3/7] Throw exception in case of insufficient tool definitions provided --- .../_tool_call_accuracy.py | 17 ++++--- .../_tool_input_accuracy.py | 17 ++++--- .../_tool_selection/_tool_selection.py | 17 ++++--- .../test_tool_call_accuracy_evaluator.py | 46 +++++++++---------- .../test_tool_input_accuracy_evaluator.py | 30 ++++++------ .../test_tool_selection_evaluator.py | 24 ++++++---- 6 files changed, 81 insertions(+), 70 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index f1caec5e9001..fa5afa3669a3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -201,15 +201,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) return { "query": query, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index f73337d949dc..b2006ad4b9b5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -134,15 +134,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls_typed, 
tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Get agent response with tool calls and results using _get_agent_response agent_response_with_tools = _get_agent_response(response, include_tool_messages=True) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 616be11b9ee8..0a3cf155889d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -147,15 +147,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Extract only tool names from tool calls, removing parameters and results tool_names = self._extract_tool_names_from_calls(tool_calls) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index c26ab8840e5d..b6f52fceb6a4 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -347,17 +347,16 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert 
result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Should throw an exception because buy_jacket definition has invalid type (not "function") + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + # because "another_built_in" is not a valid type (not "function") + assert "buy_jacket" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() - def test_evaluate_tools_all_not_applicable(self, mock_model_config): + def test_evaluate_tools_missing_tool_definitions_throws_exception(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -387,15 +386,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "fetch_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -605,12 +602,13 @@ def test_evaluate_open_api(self, mock_model_config): }, ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" + # OpenAPI is not a simple built-in tool - it requires a full definition with functions + # So calling without tool_definitions should throw an exception + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "openapi" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_open_api_with_tool_definition(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index 4e8b250fc317..c5ef4a6c371e 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -431,7 +431,7 @@ def test_evaluate_no_tool_calls(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert _ToolInputAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{key}_reason"] - def test_evaluate_no_tool_definitions(self, 
mock_model_config): + def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when no tool definitions are provided.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -452,15 +452,14 @@ def test_evaluate_no_tool_definitions(self, mock_model_config): ] tool_definitions = [] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() - def test_evaluate_missing_tool_definitions(self, mock_model_config): + def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when tool definitions are missing for some tool calls.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -481,13 +480,12 @@ def test_evaluate_missing_tool_definitions(self, mock_model_config): ] tool_definitions = [{"name": "different_function", "type": "function", "description": "A different function"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_invalid_result_value(self, mock_model_config): """Test that invalid result values raise an exception.""" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 8390e30c3e4c..0a5be6dc8951 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -242,21 +242,27 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) assert result[f"{key}_result"] == "pass" assert f"{key}_reason" in result - def test_evaluate_tool_selection_not_applicable_no_tool_definitions(self, mock_model_config): + def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect) query = "What's the weather like today?" 
- tool_calls = [] + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_weather", + "name": "get_weather", + "arguments": {"location": "current"}, + } + ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) From 0e9a6522fe905a767a1193b6564679122c4bcb26 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 14:10:22 +0200 Subject: [PATCH 4/7] Raise not applicable exception in case of no tool calls/no(t enough) tool definitions --- .../_evaluators/_common/_base_prompty_eval.py | 4 +-- .../_tool_call_accuracy.py | 16 ++++++---- .../_tool_input_accuracy.py | 24 ++++++++------ .../_tool_selection/_tool_selection.py | 12 ++++--- .../test_tool_call_accuracy_evaluator.py | 21 ++++++------ .../test_tool_input_accuracy_evaluator.py | 32 ++++++++++--------- .../test_tool_selection_evaluator.py | 17 +++++----- 7 files changed, 70 insertions(+), 56 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index de49450b4a81..8157ae956420 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -294,14 +294,14 @@ def _extract_needed_tool_definitions( raise EvaluationException( message=f"Tool definition for {tool_name} not found", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=error_target, ) else: raise EvaluationException( message=f"Tool call missing name: {tool_call}", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.MISSING_FIELD, target=error_target, ) else: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index fa5afa3669a3..3351f67af29e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -178,7 +178,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): ) from e if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -209,7 +214,7 @@ def 
_convert_kwargs_to_eval_input(self, **kwargs): if len(needed_tool_definitions) == 0: raise EvaluationException( message=self._NO_TOOL_DEFINITIONS_MESSAGE, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.USER_ERROR, ) @@ -243,7 +248,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", internal_message="Invalid score value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -273,7 +278,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message="Tool call accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) @@ -287,9 +292,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - return self._not_applicable_result(eval_input.get("error_message"), self.threshold) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index b2006ad4b9b5..07c65e2a07af 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -108,7 +108,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): # Extract tool calls from response if not response: - return {"error_message": "Response parameter is required to extract tool calls."} + raise EvaluationException( + message="Response is required for tool input accuracy evaluation.", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) try: tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) @@ -121,7 +126,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): ) from e if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -142,7 +152,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if len(needed_tool_definitions) == 0: raise EvaluationException( message=self._NO_TOOL_DEFINITIONS_MESSAGE, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, blame=ErrorBlame.USER_ERROR, ) @@ -181,7 +191,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise 
EvaluationException( message=f"Invalid result value: {result}. Expected 0 or 1.", internal_message="Invalid result value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -214,7 +224,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise EvaluationException( message="Tool input accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) @@ -228,10 +238,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - error_message = eval_input.get("error_message", "Unknown error") - return self._not_applicable_result(error_message, 1) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 0a3cf155889d..1562c0faa0f8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -136,7 +136,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = parsed_tool_calls if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -155,7 +160,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if len(needed_tool_definitions) == 0: raise EvaluationException( message=self._NO_TOOL_DEFINITIONS_MESSAGE, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=ErrorTarget.TOOL_SELECTION_EVALUATOR, blame=ErrorBlame.USER_ERROR, ) @@ -243,9 +248,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. 
eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - return self._not_applicable_result(eval_input.get("error_message"), 1) - result = await self._do_eval(eval_input) return result diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index b6f52fceb6a4..72d73d400418 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -393,6 +393,7 @@ def test_evaluate_tools_missing_tool_definitions_throws_exception(self, mock_mod # The error should mention the specific tool that's missing assert "fetch_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -417,15 +418,14 @@ def test_evaluate_tools_no_tools(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result[f"{key}_details"] == {} + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_bing_custom_search(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -609,6 +609,7 @@ def test_evaluate_open_api(self, mock_model_config): evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) assert "openapi" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_open_api_with_tool_definition(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c5ef4a6c371e..77ef9186193e 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -6,7 +6,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_input_accuracy import 
_ToolInputAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -423,13 +423,13 @@ def test_evaluate_no_tool_calls(self, mock_model_config): response = [{"role": "assistant", "content": "I can help you with that."}] tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when no tool definitions are provided.""" @@ -458,6 +458,7 @@ def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): # The error message should mention the specific tool that's missing assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when tool definitions are missing for some tool calls.""" @@ -486,6 +487,7 @@ def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_con # The error should mention the specific tool that's missing assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_invalid_result_value(self, mock_model_config): """Test that invalid result values raise an exception.""" @@ -531,13 +533,13 @@ def test_evaluate_no_response(self, mock_model_config): query = "Get weather" tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=None, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert "Response parameter is required" in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=None, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "response is required" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.MISSING_FIELD def test_parameter_extraction_accuracy_calculation(self, mock_model_config): """Test the parameter extraction accuracy calculation.""" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 
0a5be6dc8951..71e469335aae 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # Mock function for Tool Selection evaluator flow side effect @@ -234,13 +234,13 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) } ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) @@ -263,6 +263,7 @@ def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock # The error message should mention the specific tool that's missing assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) From 446bbbbaaf998af0a7ee42adf2856c077d6353ca Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 14:54:27 +0200 Subject: [PATCH 5/7] Update failed output category to failed execution --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 2 +- .../_evaluators/_tool_input_accuracy/_tool_input_accuracy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3351f67af29e..b65775182eaa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -278,7 +278,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message="Tool call accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.FAILED_EXECUTION, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 07c65e2a07af..f9b4d897443f 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -224,7 +224,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise EvaluationException( message="Tool input accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.FAILED_EXECUTION, target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) From 8d1e93bdfe8702f6f9a6c136200c84114fd47386 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 16:11:07 +0200 Subject: [PATCH 6/7] Add error category verification in tests --- .../tests/unittests/test_tool_call_accuracy_evaluator.py | 2 ++ .../tests/unittests/test_tool_input_accuracy_evaluator.py | 1 + 2 files changed, 3 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 72d73d400418..87045df2c08a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -729,6 +729,7 @@ def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): evaluator(query=query, response=response, tool_definitions=tool_definitions) assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" @@ -766,3 +767,4 @@ def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_c evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index 77ef9186193e..574acf6f9b14 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -694,3 +694,4 @@ def test_evaluate_missing_arguments_field(self, mock_model_config): evaluator(query=query, response=response, tool_definitions=tool_definitions) assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD From 7b6e70e9d86194331a673fcaa5e63532bd1cf563 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 18 Dec 2025 16:13:34 +0200 Subject: [PATCH 7/7] Update test comments --- .../tests/unittests/test_tool_call_accuracy_evaluator.py | 3 +-- .../tests/unittests/test_tool_input_accuracy_evaluator.py | 3 +-- .../tests/unittests/test_tool_selection_evaluator.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 87045df2c08a..7c0a158052fe 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -422,8 +422,7 @@ def test_evaluate_tools_no_tools(self, mock_model_config): # Expect an exception to be raised with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - # The error should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index 574acf6f9b14..2f0670cec7eb 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -426,8 +426,7 @@ def test_evaluate_no_tool_calls(self, mock_model_config): # Expect an exception to be raised with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, response=response, tool_definitions=tool_definitions) - - # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 71e469335aae..e3ba055df1e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -237,8 +237,7 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) # Expect an exception to be raised with pytest.raises(EvaluationException) as exc_info: evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - # The error message should mention the specific tool that's missing + assert "no tool calls found" in str(exc_info.value).lower() assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE
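
The net effect of this series is that the tool evaluators now raise EvaluationException instead of returning a "not applicable" result: MISSING_FIELD when a tool call lacks the 'arguments' field, and NOT_APPLICABLE when no tool calls or no matching tool definitions are found. A minimal caller-side sketch of handling the new behavior, mirroring the imports and assertions in the unit tests above; the helper name run_tool_call_accuracy, the model_config value, and the choice to treat NOT_APPLICABLE as a skipped sample are illustrative assumptions, not part of the patch:

    from azure.ai.evaluation import ToolCallAccuracyEvaluator
    from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException

    def run_tool_call_accuracy(model_config, query, tool_calls, tool_definitions):
        # model_config is assumed to be a valid model configuration for the evaluator.
        evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
        try:
            return evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
        except EvaluationException as exc:
            if exc.category is ErrorCategory.NOT_APPLICABLE:
                # No tool calls, or no matching tool definitions: nothing to score for this sample.
                return None
            # MISSING_FIELD (e.g. a tool call without 'arguments') and other categories indicate
            # malformed input and are propagated to the caller.
            raise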