GrayCodeAI
diff --git a/src/hawk/tools.py b/src/hawk/tools.py
Lines changed: 22 additions & 4 deletions
diff --git a/src/hawk/workflow.py b/src/hawk/workflow.py
Lines changed: 1 addition & 3 deletions
diff --git a/tests/test_agent.py b/tests/test_agent.py
Lines changed: 2 additions & 6 deletions
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
Lines changed: 58 additions & 38 deletions
@@ -138,12 +138,21 @@ def chat_with_tools(
         tool_results = []
         for tc in response.tool_calls:
             tool_name = tc.get("name") if isinstance(tc, dict) else getattr(tc, "name", None)
-            arguments = tc.get("arguments", {}) if isinstance(tc, dict) else getattr(tc, "arguments", {})
+            arguments = (
+                tc.get("arguments", {}) if isinstance(tc, dict) else getattr(tc, "arguments", {})
+            )
             if tool_name and tool_name in tool_map:
                 result = _execute_tool(tool_map[tool_name], arguments)
             else:
                 result = json.dumps({"error": f"Unknown tool: {tool_name}"})
-            tool_results.append({"tool_use_id": tc.get("id", "") if isinstance(tc, dict) else getattr(tc, "id", ""), "content": result})
+            tool_results.append(
+                {
+                    "tool_use_id": tc.get("id", "")
+                    if isinstance(tc, dict)
+                    else getattr(tc, "id", ""),
+                    "content": result,
+                }
+            )
 
         # Send tool results back to continue the conversation.
         response = client.chat(
@@ -201,12 +210,21 @@ async def chat_with_tools_async(
         tool_results = []
         for tc in response.tool_calls:
             tool_name = tc.get("name") if isinstance(tc, dict) else getattr(tc, "name", None)
-            arguments = tc.get("arguments", {}) if isinstance(tc, dict) else getattr(tc, "arguments", {})
+            arguments = (
+                tc.get("arguments", {}) if isinstance(tc, dict) else getattr(tc, "arguments", {})
+            )
             if tool_name and tool_name in tool_map:
                 result = await _execute_tool_async(tool_map[tool_name], arguments)
             else:
                 result = json.dumps({"error": f"Unknown tool: {tool_name}"})
-            tool_results.append({"tool_use_id": tc.get("id", "") if isinstance(tc, dict) else getattr(tc, "id", ""), "content": result})
+            tool_results.append(
+                {
+                    "tool_use_id": tc.get("id", "")
+                    if isinstance(tc, dict)
+                    else getattr(tc, "id", ""),
+                    "content": result,
+                }
+            )
 
         response = await client.chat(
             prompt,
 
@@ -114,9 +114,7 @@ def _run_step_with_timeout(self, s: Step, input_val: Any) -> Any:
             try:
                 return future.result(timeout=s.timeout)
             except concurrent.futures.TimeoutError:
-                raise TimeoutError(
-                    f"Step '{s.name}' timed out after {s.timeout}s"
-                ) from None
+                raise TimeoutError(f"Step '{s.name}' timed out after {s.timeout}s") from None
 
     def _run_step_sync(self, s: Step, input_val: Any) -> Any:
         """Run a single step with optional retry."""
 
@@ -120,9 +120,7 @@ def test_chat_with_tools(self, mock_cwt: MagicMock) -> None:
         agent = Agent(client, cfg)
         resp = agent.chat("find something")
 
-        mock_cwt.assert_called_once_with(
-            client, "find something", [t], max_rounds=3
-        )
+        mock_cwt.assert_called_once_with(client, "find something", [t], max_rounds=3)
         assert resp.response == "search result"
 
     def test_chat_passes_session_id_after_first_turn(self) -> None:
@@ -195,9 +193,7 @@ async def test_chat_with_tools(self, mock_cwt: MagicMock) -> None:
         agent = AsyncAgent(client, cfg)
         resp = await agent.chat("find something")
 
-        mock_cwt.assert_called_once_with(
-            client, "find something", [t], max_rounds=5
-        )
+        mock_cwt.assert_called_once_with(client, "find something", [t], max_rounds=5)
         assert resp.response == "async result"
 
     async def test_chat_session_continuity(self) -> None:
 
@@ -46,9 +46,7 @@ def test_defaults(self) -> None:
         assert result.error is None
 
     def test_with_error(self) -> None:
-        result = EvalResult(
-            task_name="t1", success=False, duration_ms=50.0, error="timeout"
-        )
+        result = EvalResult(task_name="t1", success=False, duration_ms=50.0, error="timeout")
         assert result.success is False
         assert result.error == "timeout"
 
@@ -66,10 +64,16 @@ def test_empty(self) -> None:
         assert br.total_tokens == 0
 
     def test_all_passed(self) -> None:
-        br = BenchmarkResults(results=[
-            EvalResult(task_name="t1", success=True, duration_ms=100.0, tokens_in=10, tokens_out=5),
-            EvalResult(task_name="t2", success=True, duration_ms=200.0, tokens_in=20, tokens_out=10),
-        ])
+        br = BenchmarkResults(
+            results=[
+                EvalResult(
+                    task_name="t1", success=True, duration_ms=100.0, tokens_in=10, tokens_out=5
+                ),
+                EvalResult(
+                    task_name="t2", success=True, duration_ms=200.0, tokens_in=20, tokens_out=10
+                ),
+            ]
+        )
         assert br.total_tasks == 2
         assert br.passed == 2
         assert br.failed == 0
@@ -78,39 +82,49 @@ def test_all_passed(self) -> None:
         assert br.total_tokens == 45
 
     def test_mixed_results(self) -> None:
-        br = BenchmarkResults(results=[
-            EvalResult(task_name="t1", success=True, duration_ms=100.0),
-            EvalResult(task_name="t2", success=False, duration_ms=200.0, error="fail"),
-        ])
+        br = BenchmarkResults(
+            results=[
+                EvalResult(task_name="t1", success=True, duration_ms=100.0),
+                EvalResult(task_name="t2", success=False, duration_ms=200.0, error="fail"),
+            ]
+        )
         assert br.passed == 1
         assert br.failed == 1
         assert br.pass_rate == 0.5
 
     def test_by_category(self) -> None:
         # by_category splits on "/" — names without "/" get "general"
-        br = BenchmarkResults(results=[
-            EvalResult(task_name="math/add", success=True, duration_ms=100.0),
-            EvalResult(task_name="math/mul", success=True, duration_ms=100.0),
-            EvalResult(task_name="general/weather", success=False, duration_ms=100.0),
-        ])
+        br = BenchmarkResults(
+            results=[
+                EvalResult(task_name="math/add", success=True, duration_ms=100.0),
+                EvalResult(task_name="math/mul", success=True, duration_ms=100.0),
+                EvalResult(task_name="general/weather", success=False, duration_ms=100.0),
+            ]
+        )
         cats = br.by_category()
         assert len(cats["math"]) == 2
         assert len(cats["general"]) == 1
 
     def test_summary(self) -> None:
-        br = BenchmarkResults(results=[
-            EvalResult(task_name="t1", success=True, duration_ms=100.0, tokens_in=10, tokens_out=5),
-        ])
+        br = BenchmarkResults(
+            results=[
+                EvalResult(
+                    task_name="t1", success=True, duration_ms=100.0, tokens_in=10, tokens_out=5
+                ),
+            ]
+        )
         summary = br.summary()
         assert "1/1 passed" in summary
         assert "100ms" in summary
         assert "15" in summary  # total tokens
 
     def test_summary_with_failures(self) -> None:
-        br = BenchmarkResults(results=[
-            EvalResult(task_name="t1", success=True, duration_ms=100.0),
-            EvalResult(task_name="t2", success=False, duration_ms=50.0, error="bad output"),
-        ])
+        br = BenchmarkResults(
+            results=[
+                EvalResult(task_name="t1", success=True, duration_ms=100.0),
+                EvalResult(task_name="t2", success=False, duration_ms=50.0, error="bad output"),
+            ]
+        )
         summary = br.summary()
         assert "1/2 passed" in summary
         assert "Failures:" in summary
@@ -178,21 +192,25 @@ def test_no_reset(self) -> None:
 
     def test_validation_pass(self) -> None:
         agent = _make_mock_agent(response="The temperature is 72F")
-        tasks = [EvalTask(
-            name="t1",
-            prompt="weather?",
-            validate=lambda r: "temperature" in r.response,
-        )]
+        tasks = [
+            EvalTask(
+                name="t1",
+                prompt="weather?",
+                validate=lambda r: "temperature" in r.response,
+            )
+        ]
         results = run_benchmark(agent, tasks)
         assert results.passed == 1
 
     def test_validation_fail(self) -> None:
         agent = _make_mock_agent(response="ok")
-        tasks = [EvalTask(
-            name="t1",
-            prompt="weather?",
-            validate=lambda r: "temperature" in r.response,
-        )]
+        tasks = [
+            EvalTask(
+                name="t1",
+                prompt="weather?",
+                validate=lambda r: "temperature" in r.response,
+            )
+        ]
         results = run_benchmark(agent, tasks)
         assert results.passed == 0
         assert results.failed == 1
@@ -253,11 +271,13 @@ async def test_validation_fail(self) -> None:
         resp.turns_taken = 1
         resp.duration = "1.0s"
         agent.chat.return_value = resp
-        tasks = [EvalTask(
-            name="t1",
-            prompt="test",
-            validate=lambda r: "target" in r.response,
-        )]
+        tasks = [
+            EvalTask(
+                name="t1",
+                prompt="test",
+                validate=lambda r: "target" in r.response,
+            )
+        ]
         results = await run_benchmark_async(agent, tasks)
         assert results.failed == 1