diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 36d928e9e072..0f6e89e800e4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -486,7 +486,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - def _parse_tools_from_response(self, response): + def _parse_tools_from_response(self, response, ensure_arguments=False): """Parse the response to extract tool calls and results. :param response: The response to parse. :type response: Union[str, List[dict]] @@ -505,6 +505,11 @@ def _parse_tools_from_response(self, response): if message.get("role") == "assistant" and isinstance(message.get("content"), list): for content_item in message.get("content"): if isinstance(content_item, dict) and content_item.get("type") == "tool_call": + if ensure_arguments and "arguments" not in content_item: + raise EvaluationException( + message=f"Tool call missing 'arguments' field: {content_item}", + category=ErrorCategory.MISSING_FIELD, + ) tool_calls.append(copy.deepcopy(content_item)) # Extract tool results from tool messages diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index de49450b4a81..8157ae956420 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -294,14 +294,14 @@ def _extract_needed_tool_definitions( raise EvaluationException( message=f"Tool definition for {tool_name} not found", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.NOT_APPLICABLE, target=error_target, ) else: raise EvaluationException( message=f"Tool call missing name: {tool_call}", blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.MISSING_FIELD, target=error_target, ) else: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index cb1b608dcdb6..5832fdd8ba45 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -162,16 +162,43 @@ def _convert_kwargs_to_eval_input(self, **kwargs): query = kwargs.get("query") response = kwargs.get("response") # TODO : Support classes that represents tool calls, messages etc once client side definitions are available + + # Initially try to extract tool calls from the response whether or not tool_calls parameter is provided if response: - parsed_tool_calls = self._parse_tools_from_response(response) - if parsed_tool_calls: - tool_calls = parsed_tool_calls + try: + parsed_tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + if parsed_tool_calls: + tool_calls = parsed_tool_calls + except EvaluationException as e: + raise 
EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] + + # Validate that all tool calls have the "arguments" key + for tool_call in tool_calls: + if isinstance(tool_call, dict): + if "arguments" not in tool_call: + raise EvaluationException( + message=f"Tool call missing 'arguments' field: {tool_call}", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) + if not isinstance(tool_definitions, list): tool_definitions = [tool_definitions] if tool_definitions else [] @@ -179,15 +206,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) return { "query": query, @@ -227,7 +257,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", internal_message="Invalid score value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -271,9 +301,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. 
eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - return self._not_applicable_result(eval_input.get("error_message"), self.threshold) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 159e8a5d7410..12332d7ee8e1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -108,11 +108,30 @@ def _convert_kwargs_to_eval_input(self, **kwargs): # Extract tool calls from response if not response: - return {"error_message": "Response parameter is required to extract tool calls."} + raise EvaluationException( + message="Response is required for tool input accuracy evaluation.", + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) + + try: + tool_calls = self._parse_tools_from_response(response, ensure_arguments=True) + except EvaluationException as e: + raise EvaluationException( + message=e.message, + category=e.category, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) from e - tool_calls = self._parse_tools_from_response(response) if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -125,15 +144,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Reformat agent response with tool calls and results using reformat_agent_response agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True) @@ -177,7 +199,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: raise EvaluationException( message=f"Invalid result value: {result}. 
Expected 0 or 1.", internal_message="Invalid result value.", - category=ErrorCategory.FAILED_EXECUTION, + category=ErrorCategory.INVALID_VALUE, blame=ErrorBlame.SYSTEM_ERROR, ) @@ -224,10 +246,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - # If there is an error message, return not applicable result - error_message = eval_input.get("error_message", "Unknown error") - return self._not_applicable_result(error_message, 1) # Do the evaluation result = await self._do_eval(eval_input) # Return the result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 48963fa00d58..06fa2927a7f1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -136,7 +136,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = parsed_tool_calls if not tool_calls: - return {"error_message": self._NO_TOOL_CALLS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_CALLS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -147,15 +152,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs): needed_tool_definitions = self._extract_needed_tool_definitions( tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR ) - except EvaluationException as e: - # Check if this is because no tool definitions were provided at all - if len(tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} - else: - return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} + except EvaluationException: + # Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details + raise + # Check if no tool definitions were found at all (including built-in tools) if len(needed_tool_definitions) == 0: - return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} + raise EvaluationException( + message=self._NO_TOOL_DEFINITIONS_MESSAGE, + category=ErrorCategory.NOT_APPLICABLE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, + blame=ErrorBlame.USER_ERROR, + ) # Extract only tool names from tool calls, removing parameters and results tool_names = self._extract_tool_names_from_calls(tool_calls) @@ -248,9 +256,6 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. 
eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get("error_message"): - return self._not_applicable_result(eval_input.get("error_message"), 1) - result = await self._do_eval(eval_input) return result diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index f84ad64b53b8..5fc04b6e4214 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation import ToolCallAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -347,17 +347,16 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Should throw an exception because buy_jacket definition has invalid type (not "function") + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + # because "another_built_in" is not a valid type (not "function") + assert "buy_jacket" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() - def test_evaluate_tools_all_not_applicable(self, mock_model_config): + def test_evaluate_tools_missing_tool_definitions_throws_exception(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -387,15 +386,14 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result[f"{key}_details"] == {} + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "fetch_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -420,15 
+418,13 @@ def test_evaluate_tools_no_tools(self, mock_model_config): }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result[f"{key}_details"] == {} + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_bing_custom_search(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -605,12 +601,14 @@ def test_evaluate_open_api(self, mock_model_config): }, ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" + # OpenAPI is not a simple built-in tool - it requires a full definition with functions + # So calling without tool_definitions should throw an exception + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "openapi" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_open_api_with_tool_definition(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -726,3 +724,84 @@ def test_evaluate_missing_query(self, mock_model_config): evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Query is a required input" in str(exc_info.value) + + def test_evaluate_tools_missing_arguments_in_response(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" 
+ # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD + + def test_evaluate_tools_missing_arguments_in_tool_calls_param(self, mock_model_config): + """Test that an exception is raised when tool_calls parameter contains calls without arguments field.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" + # Tool calls parameter with missing 'arguments' field + tool_calls = [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'fetch_weather', + # Missing 'arguments' field here + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c41193c489ca..8e989da991ac 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -6,7 +6,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # This mock should return a dictionary that mimics the output of the prompty (the _flow call), @@ -423,15 +423,14 @@ def test_evaluate_no_tool_calls(self, mock_model_config): response = [{"role": "assistant", "content": "I can help you with that."}] tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in 
result[f"{key}_reason"] + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE - def test_evaluate_no_tool_definitions(self, mock_model_config): + def test_evaluate_no_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when no tool definitions are provided.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -452,15 +451,15 @@ def test_evaluate_no_tool_definitions(self, mock_model_config): ] tool_definitions = [] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE - def test_evaluate_missing_tool_definitions(self, mock_model_config): + def test_evaluate_missing_tool_definitions_throws_exception(self, mock_model_config): """Test evaluation when tool definitions are missing for some tool calls.""" evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) @@ -481,13 +480,13 @@ def test_evaluate_missing_tool_definitions(self, mock_model_config): ] tool_definitions = [{"name": "different_function", "type": "function", "description": "A different function"}] - result = evaluator(query=query, response=response, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert _ToolInputAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # The error should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_invalid_result_value(self, mock_model_config): """Test that invalid result values raise an exception.""" @@ -533,13 +532,13 @@ def test_evaluate_no_response(self, mock_model_config): query = "Get weather" tool_definitions = [{"name": "get_weather", "type": "function", "description": "Get weather information"}] - result = evaluator(query=query, response=None, tool_definitions=tool_definitions) - - key = _ToolInputAccuracyEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert "Response parameter is required" in result[f"{key}_reason"] + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=None, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's 
missing + assert "response is required" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.MISSING_FIELD def test_parameter_extraction_accuracy_calculation(self, mock_model_config): """Test the parameter extraction accuracy calculation.""" @@ -695,3 +694,46 @@ def test_evaluate_missing_query(self, mock_model_config): evaluator(response=response, tool_definitions=tool_definitions) assert "Query is a required input" in str(exc_info.value) + + def test_evaluate_missing_arguments_field(self, mock_model_config): + """Test that an exception is raised when response contains tool calls without arguments field.""" + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = "What's the weather in Paris?" + # Response with tool call missing the 'arguments' field + response = [ + { + 'role': 'assistant', + 'content': [ + { + 'type': 'tool_call', + 'tool_call_id': 'call_123', + 'name': 'get_weather', + # Missing 'arguments' field here + } + ] + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for.", + } + }, + }, + }, + ] + + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + assert "Tool call missing 'arguments' field" in str(exc_info.value) + assert exc_info.value.category is ErrorCategory.MISSING_FIELD diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index bf23c45f5d43..b7e3c36cb6a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -2,7 +2,7 @@ import pytest from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator -from azure.ai.evaluation._exceptions import EvaluationException +from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException # Mock function for Tool Selection evaluator flow side effect @@ -234,29 +234,35 @@ def test_evaluate_tool_selection_fail_no_tools_selected(self, mock_model_config) } ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + assert "no tool calls found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE - def test_evaluate_tool_selection_not_applicable_no_tool_definitions(self, mock_model_config): + def test_evaluate_tool_selection_no_tool_definitions_throws_exception(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect) query = "What's the weather like today?" 
- tool_calls = [] + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_weather", + "name": "get_weather", + "arguments": {"location": "current"}, + } + ] tool_definitions = [] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - - key = _ToolSelectionEvaluator._RESULT_KEY - assert result is not None - assert result[key] == "not applicable" - assert result[f"{key}_result"] == "pass" - assert f"{key}_reason" in result + # Expect an exception to be raised + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + # The error message should mention the specific tool that's missing + assert "get_weather" in str(exc_info.value).lower() and "not found" in str(exc_info.value).lower() + assert exc_info.value.category is ErrorCategory.NOT_APPLICABLE def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config): evaluator = _ToolSelectionEvaluator(model_config=mock_model_config)
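
Note (illustrative, not part of the diff): with this change the tool evaluators raise EvaluationException instead of returning a "not applicable" result dict, so calling code that previously inspected the error_message key now needs to catch the exception and, if useful, branch on its category. Below is a minimal sketch of that caller-side handling, assuming the same public entry points exercised by the unit tests above; the model configuration values and the tool call/definition payloads are placeholders modeled on those tests, not part of this PR.

# Sketch of caller-side handling for the new exception-based error signaling.
# Placeholder values throughout; the import paths mirror the unit tests in this diff.
from azure.ai.evaluation import ToolCallAccuracyEvaluator
from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                        # placeholder
    "api_key": "<your-api-key>",                                    # placeholder
}

evaluator = ToolCallAccuracyEvaluator(model_config=model_config)

tool_calls = [
    {
        "type": "tool_call",
        "tool_call_id": "call_123",
        "name": "fetch_weather",
        "arguments": {"location": "Paris"},  # omitting this field now raises MISSING_FIELD
    }
]
tool_definitions = [
    {
        "name": "fetch_weather",
        "type": "function",
        "description": "Fetches the weather information for the specified location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "The location to fetch weather for."}
            },
        },
    }
]

try:
    result = evaluator(
        query="What's the weather in Paris?",
        tool_calls=tool_calls,
        tool_definitions=tool_definitions,
    )
except EvaluationException as exc:
    if exc.category is ErrorCategory.NOT_APPLICABLE:
        # Cases that previously produced a "not applicable" result (no tool calls,
        # no matching tool definitions) now surface here instead.
        result = None
    elif exc.category is ErrorCategory.MISSING_FIELD:
        # e.g. a tool call without an 'arguments' field, or a missing response.
        result = None
    else:
        raise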