Azure · salma-elshafey · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 18, 2025
@@ -486,7 +486,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
-    def _parse_tools_from_response(self, response):
+    def _parse_tools_from_response(self, response, ensure_arguments=False):
         """Parse the response to extract tool calls and results.
         :param response: The response to parse.
         :type response: Union[str, List[dict]]
@@ -505,6 +505,11 @@ def _parse_tools_from_response(self, response):
                 if message.get("role") == "assistant" and isinstance(message.get("content"), list):
                     for content_item in message.get("content"):
                         if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            if ensure_arguments and "arguments" not in content_item:
+                                raise EvaluationException(
+                                    message=f"Tool call missing 'arguments' field: {content_item}",
+                                    category=ErrorCategory.MISSING_FIELD,
+                                )
                             tool_calls.append(copy.deepcopy(content_item))
 
                 # Extract tool results from tool messages

@@ -294,14 +294,14 @@ def _extract_needed_tool_definitions(
                             raise EvaluationException(
                                 message=f"Tool definition for {tool_name} not found",
                                 blame=ErrorBlame.USER_ERROR,
-                                category=ErrorCategory.INVALID_VALUE,
+                                category=ErrorCategory.NOT_APPLICABLE,
                                 target=error_target,
                             )
                     else:
                         raise EvaluationException(
                             message=f"Tool call missing name: {tool_call}",
                             blame=ErrorBlame.USER_ERROR,
-                            category=ErrorCategory.INVALID_VALUE,
+                            category=ErrorCategory.MISSING_FIELD,
                             target=error_target,
                         )
                 else:

@@ -162,32 +162,62 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
         query = kwargs.get("query")
         response = kwargs.get("response")
         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
+
+        # Initially try to extract tool calls from the response whether or not tool_calls parameter is provided
         if response:
-            parsed_tool_calls = self._parse_tools_from_response(response)
-            if parsed_tool_calls:
-                tool_calls = parsed_tool_calls
+            try:
+                parsed_tool_calls = self._parse_tools_from_response(response, ensure_arguments=True)
+                if parsed_tool_calls:
+                    tool_calls = parsed_tool_calls
+            except EvaluationException as e:
+                raise EvaluationException(
+                    message=e.message,
+                    category=e.category,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    blame=ErrorBlame.USER_ERROR,
+                ) from e
 
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            raise EvaluationException(
+                message=self._NO_TOOL_CALLS_MESSAGE,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
+
+        # Validate that all tool calls have the "arguments" key
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                if "arguments" not in tool_call:
+                    raise EvaluationException(
+                        message=f"Tool call missing 'arguments' field: {tool_call}",
+                        category=ErrorCategory.MISSING_FIELD,
+                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        blame=ErrorBlame.USER_ERROR,
+                    )
+
         if not isinstance(tool_definitions, list):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(
                 tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
             )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+        except EvaluationException:
+            # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details
+            raise
 
+        # Check if no tool definitions were found at all (including built-in tools)
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            raise EvaluationException(
+                message=self._NO_TOOL_DEFINITIONS_MESSAGE,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
         return {
             "query": query,
@@ -227,7 +257,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
                     internal_message="Invalid score value.",
-                    category=ErrorCategory.FAILED_EXECUTION,
+                    category=ErrorCategory.INVALID_VALUE,
                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
                     blame=ErrorBlame.SYSTEM_ERROR,
                 )
@@ -271,9 +301,6 @@ async def _real_call(self, **kwargs):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
-        if isinstance(eval_input, dict) and eval_input.get("error_message"):
-            # If there is an error message, return not applicable result
-            return self._not_applicable_result(eval_input.get("error_message"), self.threshold)
         # Do the evaluation
         result = await self._do_eval(eval_input)
         # Return the result

@@ -108,11 +108,30 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
 
         # Extract tool calls from response
         if not response:
-            return {"error_message": "Response parameter is required to extract tool calls."}
+            raise EvaluationException(
+                message="Response is required for tool input accuracy evaluation.",
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        try:
+            tool_calls = self._parse_tools_from_response(response, ensure_arguments=True)
+        except EvaluationException as e:
+            raise EvaluationException(
+                    message=e.message,
+                    category=e.category,
+                    target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+                    blame=ErrorBlame.USER_ERROR,
+            ) from e
 
-        tool_calls = self._parse_tools_from_response(response)
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            raise EvaluationException(
+                message=self._NO_TOOL_CALLS_MESSAGE,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
@@ -125,15 +144,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             needed_tool_definitions = self._extract_needed_tool_definitions(
                 tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
             )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+        except EvaluationException:
+            # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details
+            raise
 
+        # Check if no tool definitions were found at all (including built-in tools)
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            raise EvaluationException(
+                message=self._NO_TOOL_DEFINITIONS_MESSAGE,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
         # Reformat agent response with tool calls and results using reformat_agent_response
         agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
@@ -177,7 +199,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
                 raise EvaluationException(
                     message=f"Invalid result value: {result}. Expected 0 or 1.",
                     internal_message="Invalid result value.",
-                    category=ErrorCategory.FAILED_EXECUTION,
+                    category=ErrorCategory.INVALID_VALUE,
                     blame=ErrorBlame.SYSTEM_ERROR,
                 )
 
@@ -224,10 +246,6 @@ async def _real_call(self, **kwargs):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
-        if isinstance(eval_input, dict) and eval_input.get("error_message"):
-            # If there is an error message, return not applicable result
-            error_message = eval_input.get("error_message", "Unknown error")
-            return self._not_applicable_result(error_message, 1)
         # Do the evaluation
         result = await self._do_eval(eval_input)
         # Return the result

@@ -136,7 +136,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
                 tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            raise EvaluationException(
+                message=self._NO_TOOL_CALLS_MESSAGE,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
@@ -147,15 +152,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             needed_tool_definitions = self._extract_needed_tool_definitions(
                 tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
             )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+        except EvaluationException:
+            # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details
+            raise
 
+        # Check if no tool definitions were found at all (including built-in tools)
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            raise EvaluationException(
+                message=self._NO_TOOL_DEFINITIONS_MESSAGE,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
         # Extract only tool names from tool calls, removing parameters and results
         tool_names = self._extract_tool_names_from_calls(tool_calls)
@@ -248,9 +256,6 @@ async def _real_call(self, **kwargs):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
-        if isinstance(eval_input, dict) and eval_input.get("error_message"):
-            return self._not_applicable_result(eval_input.get("error_message"), 1)
-
         result = await self._do_eval(eval_input)
 
         return result