Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
aggregated["evaluation_per_turn"] = evaluation_per_turn
return aggregated

def _parse_tools_from_response(self, response):
def _parse_tools_from_response(self, response, ensure_arguments=False):
"""Parse the response to extract tool calls and results.
:param response: The response to parse.
:type response: Union[str, List[dict]]
Expand All @@ -505,6 +505,11 @@ def _parse_tools_from_response(self, response):
if message.get("role") == "assistant" and isinstance(message.get("content"), list):
for content_item in message.get("content"):
if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
if ensure_arguments and "arguments" not in content_item:
raise EvaluationException(
message=f"Tool call missing 'arguments' field: {content_item}",
category=ErrorCategory.MISSING_FIELD,
)
tool_calls.append(copy.deepcopy(content_item))

# Extract tool results from tool messages
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -294,14 +294,14 @@ def _extract_needed_tool_definitions(
raise EvaluationException(
message=f"Tool definition for {tool_name} not found",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
category=ErrorCategory.NOT_APPLICABLE,
target=error_target,
)
else:
raise EvaluationException(
message=f"Tool call missing name: {tool_call}",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
category=ErrorCategory.MISSING_FIELD,
target=error_target,
)
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,32 +162,62 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
query = kwargs.get("query")
response = kwargs.get("response")
# TODO: Support classes that represent tool calls, messages, etc. once client-side definitions are available

# Initially try to extract tool calls from the response whether or not tool_calls parameter is provided
if response:
parsed_tool_calls = self._parse_tools_from_response(response)
if parsed_tool_calls:
tool_calls = parsed_tool_calls
try:
parsed_tool_calls = self._parse_tools_from_response(response, ensure_arguments=True)
if parsed_tool_calls:
tool_calls = parsed_tool_calls
except EvaluationException as e:
raise EvaluationException(
message=e.message,
category=e.category,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
) from e

if not tool_calls:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
raise EvaluationException(
message=self._NO_TOOL_CALLS_MESSAGE,
category=ErrorCategory.NOT_APPLICABLE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

if not isinstance(tool_calls, list):
tool_calls = [tool_calls]

# Validate that all tool calls have the "arguments" key
for tool_call in tool_calls:
if isinstance(tool_call, dict):
if "arguments" not in tool_call:
raise EvaluationException(
message=f"Tool call missing 'arguments' field: {tool_call}",
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

if not isinstance(tool_definitions, list):
tool_definitions = [tool_definitions] if tool_definitions else []

try:
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
except EvaluationException:
# Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details
raise

# Check if no tool definitions were found at all (including built-in tools)
if len(needed_tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
raise EvaluationException(
message=self._NO_TOOL_DEFINITIONS_MESSAGE,
category=ErrorCategory.NOT_APPLICABLE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

return {
"query": query,
Expand Down Expand Up @@ -227,7 +257,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
raise EvaluationException(
message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
internal_message="Invalid score value.",
category=ErrorCategory.FAILED_EXECUTION,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
blame=ErrorBlame.SYSTEM_ERROR,
)
Expand Down Expand Up @@ -271,9 +301,6 @@ async def _real_call(self, **kwargs):
"""
# Convert inputs into list of evaluable inputs.
eval_input = self._convert_kwargs_to_eval_input(**kwargs)
if isinstance(eval_input, dict) and eval_input.get("error_message"):
# If there is an error message, return not applicable result
return self._not_applicable_result(eval_input.get("error_message"), self.threshold)
# Do the evaluation
result = await self._do_eval(eval_input)
# Return the result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,30 @@ def _convert_kwargs_to_eval_input(self, **kwargs):

# Extract tool calls from response
if not response:
return {"error_message": "Response parameter is required to extract tool calls."}
raise EvaluationException(
message="Response is required for tool input accuracy evaluation.",
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

try:
tool_calls = self._parse_tools_from_response(response, ensure_arguments=True)
except EvaluationException as e:
raise EvaluationException(
message=e.message,
category=e.category,
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
) from e

tool_calls = self._parse_tools_from_response(response)
if not tool_calls:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
raise EvaluationException(
message=self._NO_TOOL_CALLS_MESSAGE,
category=ErrorCategory.NOT_APPLICABLE,
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

if not isinstance(tool_calls, list):
tool_calls = [tool_calls]
Expand All @@ -125,15 +144,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
except EvaluationException:
# Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details
raise

# Check if no tool definitions were found at all (including built-in tools)
if len(needed_tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
raise EvaluationException(
message=self._NO_TOOL_DEFINITIONS_MESSAGE,
category=ErrorCategory.NOT_APPLICABLE,
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

# Reformat agent response with tool calls and results using reformat_agent_response
agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
Expand Down Expand Up @@ -177,7 +199,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
raise EvaluationException(
message=f"Invalid result value: {result}. Expected 0 or 1.",
internal_message="Invalid result value.",
category=ErrorCategory.FAILED_EXECUTION,
category=ErrorCategory.INVALID_VALUE,
blame=ErrorBlame.SYSTEM_ERROR,
)

Expand Down Expand Up @@ -224,10 +246,6 @@ async def _real_call(self, **kwargs):
"""
# Convert inputs into list of evaluable inputs.
eval_input = self._convert_kwargs_to_eval_input(**kwargs)
if isinstance(eval_input, dict) and eval_input.get("error_message"):
# If there is an error message, return not applicable result
error_message = eval_input.get("error_message", "Unknown error")
return self._not_applicable_result(error_message, 1)
# Do the evaluation
result = await self._do_eval(eval_input)
# Return the result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,12 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
tool_calls = parsed_tool_calls

if not tool_calls:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
raise EvaluationException(
message=self._NO_TOOL_CALLS_MESSAGE,
category=ErrorCategory.NOT_APPLICABLE,
target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

if not isinstance(tool_calls, list):
tool_calls = [tool_calls]
Expand All @@ -147,15 +152,18 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
except EvaluationException:
# Re-raise the exception from _extract_needed_tool_definitions as it already has specific error details
raise

# Check if no tool definitions were found at all (including built-in tools)
if len(needed_tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
raise EvaluationException(
message=self._NO_TOOL_DEFINITIONS_MESSAGE,
category=ErrorCategory.NOT_APPLICABLE,
target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
blame=ErrorBlame.USER_ERROR,
)

# Extract only tool names from tool calls, removing parameters and results
tool_names = self._extract_tool_names_from_calls(tool_calls)
Expand Down Expand Up @@ -248,9 +256,6 @@ async def _real_call(self, **kwargs):
"""
# Convert inputs into list of evaluable inputs.
eval_input = self._convert_kwargs_to_eval_input(**kwargs)
if isinstance(eval_input, dict) and eval_input.get("error_message"):
return self._not_applicable_result(eval_input.get("error_message"), 1)

result = await self._do_eval(eval_input)

return result
Expand Down
Loading
Loading