diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 36d928e9e072..0f6e89e800e4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -486,7 +486,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - def _parse_tools_from_response(self, response): + def _parse_tools_from_response(self, response, ensure_arguments=False): """Parse the response to extract tool calls and results. :param response: The response to parse. :type response: Union[str, List[dict]] @@ -505,6 +505,11 @@ def _parse_tools_from_response(self, response): if message.get("role") == "assistant" and isinstance(message.get("content"), list): for content_item in message.get("content"): if isinstance(content_item, dict) and content_item.get("type") == "tool_call": + if ensure_arguments and "arguments" not in content_item: + raise EvaluationException( + message=f"Tool call missing 'arguments' field: {content_item}", + category=ErrorCategory.MISSING_FIELD, + ) tool_calls.append(copy.deepcopy(content_item)) # Extract tool results from tool messages diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index de49450b4a81..8157ae956420 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -294,14 +294,14 @@ def _extract_needed_tool_definitions( raise EvaluationException( message=f"Tool definition for {tool_name} not found", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=error_target, ) else: raise EvaluationException( message=f"Tool call missing name: {tool_call}", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.MISSING_FIELD, target=error_target, ) else: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index cb1b608dcdb6..5832fdd8ba45 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -162,16 +162,43 @@ def _convert_kwargs_to_eval_input(self, **kwargs): query = kwargs.get("query") response = kwargs.get("response") # TODO : Support classes that represents tool calls, messages etc once client side definitions are available + + # Initially try to extract tool calls from the response whether or not tool_calls parameter is provided if response: - parsed_tool_calls = self._parse_tools_from_response(response) - if parsed_tool_calls: - tool_calls = parsed_tool_calls + try: + parsed_tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + if parsed_tool_calls: + tool_calls = parsed_tool_calls + except EvaluationException as e: + raise 
EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] + + # Validate that all tool calls have the "arguments" key + for tool_call in tool_calls: + if isinstance(tool_call, dict): + if "arguments" not in tool_call: + raise EvaluationException( + message=f"Tool call missing 'arguments' field: {tool_call}", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) + if not isinstance(tool_definitions, list): tool_definitions = [tool_definitions] if tool_definitions else [] @@ -179,15 +206,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) return { "query": query, @@ -227,7 +257,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", internal_message="Invalid score value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -271,9 +301,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. 
eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - return self._not_applicable_result(eval_input.get("error_message"), self.threshold) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 159e8a5d7410..12332d7ee8e1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -108,11 +108,30 @@ def _convert_kwargs_to_eval_input(self, **kwargs): # Extract tool calls from response if not response: - return {"error_message": "Response parameter is required to extract tool calls."} + raise EvaluationException( + message="Response is required for tool input accuracy evaluation.", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) + + try: + tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + except EvaluationException as e: + raise EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e - tool_calls = self._parse_tools_from_response(response) if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -125,15 +144,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Reformat agent response with tool calls and results using reformat_agent_response agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True) @@ -177,7 +199,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise EvaluationException( message=f"Invalid result value: {result}. 
Expected 0 or 1.", internal_message="Invalid result value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -224,10 +246,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - error_message = eval_input.get("error_message", "Unknown error") - return self._not_applicable_result(error_message, 1) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 48963fa00d58..06fa2927a7f1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -136,7 +136,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = parsed_tool_calls if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -147,15 +152,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Extract only tool names from tool calls, removing parameters and results tool_names = self._extract_tool_names_from_calls(tool_calls) @@ -248,9 +256,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. 
eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - return self._not_applicable_result(eval_input.get("error_message"), 1) - result = await self._do_eval(eval_input) return result diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index f84ad64b53b8..5fc04b6e4214 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -347,17 +347,16 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Should throw an exception because buy_jacket definition has invalid type (not "function") + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + # because "another_built_in" is not a valid type (not "function") + assert "buy_jacket" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() - def test_evaluate_tools_all_not_applicable(self, mock_model_config): + def test_evaluate_tools_missing_tool_definitions_throws_exception(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -387,15 +386,14 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "fetch_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -420,15 
+418,13 @@ def test_evaluate_tools_no_tools(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result[f"{key}_details"] == {} + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_bing_custom_search(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -605,12 +601,14 @@ def test_evaluate_open_api(self, mock_model_config): }, ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" + # OpenAPI is not a simple built-in tool - it requires a full definition with functions + # So calling without tool_definitions should throw an exception + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "openapi" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_open_api_with_tool_definition(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -726,3 +724,84 @@ def test_evaluate_missing_query(self, mock_model_config): evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Query is a required input" in str(exc_info.value) + + def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" 
+ # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD + + def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): + """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" + # Tool calls parameter with missing 'arguments' field + tool_calls = [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c41193c489ca..8e989da991ac 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -6,7 +6,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -423,15 +423,14 @@ def test_evaluate_no_tool_calls(self, mock_model_config): response = [{"role": "assistant", "content": "I can help you with that."}] tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in 
result[f"{key}_reason"] + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE - def test_evaluate_no_tool_definitions(self, mock_model_config): + def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when no tool definitions are provided.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -452,15 +451,15 @@ def test_evaluate_no_tool_definitions(self, mock_model_config): ] tool_definitions = [] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE - def test_evaluate_missing_tool_definitions(self, mock_model_config): + def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when tool definitions are missing for some tool calls.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -481,13 +480,13 @@ def test_evaluate_missing_tool_definitions(self, mock_model_config): ] tool_definitions = [{"name": "different_function", "type": "function", "description": "A different function"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_invalid_result_value(self, mock_model_config): """Test that invalid result values raise an exception.""" @@ -533,13 +532,13 @@ def test_evaluate_no_response(self, mock_model_config): query = "Get weather" tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=None, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert "Response parameter is required" in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=None, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's 
missing + assert "response is required" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.MISSING_FIELD def test_parameter_extraction_accuracy_calculation(self, mock_model_config): """Test the parameter extraction accuracy calculation.""" @@ -695,3 +694,46 @@ def test_evaluate_missing_query(self, mock_model_config): evaluator(response=response, tool_definitions=tool_definitions) assert "Query is a required input" in str(exc_info.value) + + def test_evaluate_missing_arguments_field(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" + # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'get_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index bf23c45f5d43..b7e3c36cb6a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # Mock function for Tool Selection evaluator flow side effect @@ -234,29 +234,35 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) } ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE - def test_evaluate_tool_selection_not_applicable_no_tool_definitions(self, mock_model_config): + def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect) query = "What's the weather like today?" 
- tool_calls = [] + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_weather", + "name": "get_weather", + "arguments": {"location": "current"}, + } + ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config)
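
Note (illustrative, not part of the diff): with this change the tool evaluators raise EvaluationException instead of returning a "not applicable" result dict, so calling code that previously inspected the error_message key now needs to catch the exception and, if useful, branch on its category. Below is a minimal sketch of that caller-side handling, assuming the same public entry points exercised by the unit tests above; the model configuration values and the tool call/definition payloads are placeholders modeled on those tests, not part of this PR.

# Sketch of caller-side handling for the new exception-based error signaling.
# Placeholder values throughout; the import paths mirror the unit tests in this diff.
from azure.ai.evaluation import ToolCallAccuracyEvaluator
from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                        # placeholder
    "api_key": "<your-api-key>",                                    # placeholder
}

evaluator = ToolCallAccuracyEvaluator(model_config=model_config)

tool_calls = [
    {
        "type": "tool_call",
        "tool_call_id": "call_123",
        "name": "fetch_weather",
        "arguments": {"location": "Paris"},  # omitting this field now raises MISSING_FIELD
    }
]
tool_definitions = [
    {
        "name": "fetch_weather",
        "type": "function",
        "description": "Fetches the weather information for the specified location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "The location to fetch weather for."}
            },
        },
    }
]

try:
    result = evaluator(
        query="What's the weather in Paris?",
        tool_calls=tool_calls,
        tool_definitions=tool_definitions,
    )
except EvaluationException as exc:
    if exc.category is ErrorCategory.NOT_APPLICABLE:
        # Cases that previously produced a "not applicable" result (no tool calls,
        # no matching tool definitions) now surface here instead.
        result = None
    elif exc.category is ErrorCategory.MISSING_FIELD:
        # e.g. a tool call without an 'arguments' field, or a missing response.
        result = None
    else:
        raise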