From ad115b98d8dab72ce560cbf3d0af4fa167cffea3 Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 17:19:29 +0200
Subject: [PATCH 1/8] test: add failing repro for judge usage dropped from
 reports (#60)

Tests assert that a judge model registered in setup_evaluators and
invoked during evaluate() has non-zero tokens in both report["usage"]
and Benchmark.usage. Both tests fail on current main because the usage
snapshot is taken before evaluate() runs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../test_benchmark/test_usage_collection.py   | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py
index ac42a92a..eb916904 100644
--- a/tests/test_core/test_benchmark/test_usage_collection.py
+++ b/tests/test_core/test_benchmark/test_usage_collection.py
@@ -96,3 +96,91 @@ def test_usage_property_returns_total(self):
         total = benchmark.usage
         assert total is not None
         # cost may be None if DummyModelAdapter doesn't provide usage
+
+
+# ---------------------------------------------------------------------------
+# Bug #60 reproduction: evaluator (judge) usage was dropped from reports
+# because collect_all_usage() ran before evaluate(). The fixtures below
+# build a benchmark whose evaluator owns a model adapter and only invokes
+# it during __call__, mirroring real LLM judges.
+# ---------------------------------------------------------------------------
+
+
+class _JudgeEvaluator:
+    """Evaluator that invokes a model adapter during __call__.
+
+    Defined as a duck-typed Evaluator (matches the abstract interface) so it
+    can hold a model reference and exercise it at evaluate-time.
+    """
+
+    def __init__(self, task, environment, user, model):
+        self.task = task
+        self.environment = environment
+        self.user = user
+        self.model = model
+
+    def filter_traces(self, traces):
+        return traces
+
+    def __call__(self, traces, final_answer=None):
+        # Invoke the judge model — this is the action whose usage was
+        # previously lost because collect_all_usage() had already run.
+        self.model.chat([{"role": "user", "content": "judge this"}])
+        return {"score": 1.0, "passed": True}
+
+
+def _make_judge_benchmark(judge_usage):
+    """Build a JudgeBenchmark whose setup_evaluators registers a judge model.
+
+    The judge model is created with the provided per-call usage dict. Each
+    call to the model appends one usage record, so a single evaluator
+    invocation produces exactly one record's worth of tokens.
+    """
+    from conftest import DummyBenchmark, DummyModelAdapter
+
+    class JudgeBenchmark(DummyBenchmark):
+        def setup_evaluators(self, environment, task, agents, user, seed_generator):
+            judge_model = DummyModelAdapter(model_id="judge", usage=judge_usage)
+            self.register("models", "judge_model", judge_model)
+            return [_JudgeEvaluator(task, environment, user, model=judge_model)]
+
+    return JudgeBenchmark()
+
+
+@pytest.mark.core
+class TestBenchmarkJudgeUsage:
+    """Regression tests for issue #60: judge token usage must appear in
+    per-task reports and aggregate into ``benchmark.usage``."""
+
+    def test_judge_model_usage_captured_in_report(self):
+        """A judge model invoked during evaluate() has non-zero usage in
+        report['usage']['models']['judge_model']."""
+        judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
+        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = _make_judge_benchmark(judge_usage)
+
+        reports = benchmark.run(tasks, agent_data={"model": "test"})
+
+        models = reports[0]["usage"]["models"]
+        assert "judge_model" in models, f"judge_model not registered; got: {list(models)}"
+        judge_entry = models["judge_model"]
+        assert judge_entry["input_tokens"] == 100
+        assert judge_entry["output_tokens"] == 50
+
+    def test_judge_model_usage_aggregated_in_benchmark_total(self):
+        """``benchmark.usage`` includes judge tokens, and
+        ``benchmark.usage_by_component`` has a non-zero ``models:judge_model``
+        entry."""
+        judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
+        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = _make_judge_benchmark(judge_usage)
+
+        benchmark.run(tasks, agent_data={"model": "test"})
+
+        assert benchmark.usage.input_tokens >= 100
+        assert benchmark.usage.output_tokens >= 50
+
+        by_component = benchmark.usage_by_component
+        assert "models:judge_model" in by_component, f"keys: {list(by_component)}"
+        assert by_component["models:judge_model"].input_tokens == 100
+        assert by_component["models:judge_model"].output_tokens == 50

From bc02d473595f3bba43a9c9734a994ac6c8d2e7e8 Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 17:33:32 +0200
Subject: [PATCH 2/8] test: tighten judge-usage repro per code review (#60)

Address code review on ad115b9: remove implementation-narrative
comments, drop unused state from _JudgeEvaluator, move conftest
import to module scope, condense docstrings, assert total_tokens.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../test_benchmark/test_usage_collection.py   | 28 ++++++++-----------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py
index eb916904..69052acf 100644
--- a/tests/test_core/test_benchmark/test_usage_collection.py
+++ b/tests/test_core/test_benchmark/test_usage_collection.py
@@ -6,6 +6,7 @@
 
 import pytest
 from maseval import TaskQueue
+from conftest import DummyBenchmark, DummyModelAdapter
 
 
 @pytest.mark.core
@@ -99,32 +100,26 @@ def test_usage_property_returns_total(self):
 
 
 # ---------------------------------------------------------------------------
-# Bug #60 reproduction: evaluator (judge) usage was dropped from reports
-# because collect_all_usage() ran before evaluate(). The fixtures below
-# build a benchmark whose evaluator owns a model adapter and only invokes
-# it during __call__, mirroring real LLM judges.
+# Regression tests for issue #60: judge (evaluator) token usage collection
 # ---------------------------------------------------------------------------
 
 
 class _JudgeEvaluator:
-    """Evaluator that invokes a model adapter during __call__.
+    """Minimal evaluator that invokes a model adapter at evaluate-time.
 
-    Defined as a duck-typed Evaluator (matches the abstract interface) so it
-    can hold a model reference and exercise it at evaluate-time.
+    Not an ``Evaluator`` subclass — it implements the duck-typed
+    interface (``filter_traces`` + ``__call__``) that ``DummyBenchmark.evaluate``
+    iterates over (see ``tests/conftest.py``).
     """
 
-    def __init__(self, task, environment, user, model):
-        self.task = task
-        self.environment = environment
-        self.user = user
+    def __init__(self, model):
         self.model = model
 
     def filter_traces(self, traces):
         return traces
 
     def __call__(self, traces, final_answer=None):
-        # Invoke the judge model — this is the action whose usage was
-        # previously lost because collect_all_usage() had already run.
+        # Invokes the judge at evaluate-time (not at setup).
         self.model.chat([{"role": "user", "content": "judge this"}])
         return {"score": 1.0, "passed": True}
 
@@ -136,21 +131,19 @@ def _make_judge_benchmark(judge_usage):
     call to the model appends one usage record, so a single evaluator
     invocation produces exactly one record's worth of tokens.
     """
-    from conftest import DummyBenchmark, DummyModelAdapter
 
     class JudgeBenchmark(DummyBenchmark):
         def setup_evaluators(self, environment, task, agents, user, seed_generator):
             judge_model = DummyModelAdapter(model_id="judge", usage=judge_usage)
             self.register("models", "judge_model", judge_model)
-            return [_JudgeEvaluator(task, environment, user, model=judge_model)]
+            return [_JudgeEvaluator(model=judge_model)]
 
     return JudgeBenchmark()
 
 
 @pytest.mark.core
 class TestBenchmarkJudgeUsage:
-    """Regression tests for issue #60: judge token usage must appear in
-    per-task reports and aggregate into ``benchmark.usage``."""
+    """Tests that judge token usage reaches both per-task reports and ``benchmark.usage``."""
 
     def test_judge_model_usage_captured_in_report(self):
         """A judge model invoked during evaluate() has non-zero usage in
@@ -166,6 +159,7 @@ def test_judge_model_usage_captured_in_report(self):
         judge_entry = models["judge_model"]
         assert judge_entry["input_tokens"] == 100
         assert judge_entry["output_tokens"] == 50
+        assert judge_entry["total_tokens"] == 150
 
     def test_judge_model_usage_aggregated_in_benchmark_total(self):
         """``benchmark.usage`` includes judge tokens, and

From da99acd287da8e75ee8e08fa980eda34731ae59f Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 17:36:31 +0200
Subject: [PATCH 3/8] fix(core): collect usage after evaluate() so judge tokens
 are captured (#60)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In Benchmark._execute_task_repetition the usage snapshot was taken in
step 3 — before evaluate() ran. Models registered in setup_evaluators
(LLM judges) were in the registry but had empty _usage_records at that
point, so their tokens showed up as zero in report["usage"] and were
never folded into Benchmark.usage / usage_by_component.

Move collect_all_usage() into its own step 4b that runs after
evaluate(). Configs and traces remain in step 3 because evaluate()
consumes traces. The change is a pure lifecycle reordering inside one
worker thread; no new shared state, no new locks, no API/schema change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 maseval/core/benchmark.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 68ce11fd..1fb04e10 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -1254,12 +1254,14 @@ def _execute_task_repetition(
 
             final_answers = None
 
-        # 3. Collect traces, configs, and usage (always attempt this)
+        # 3. Collect traces and configs (always attempt this).
+        # Usage is collected later (step 4b) so that token usage from
+        # evaluator-owned models (LLM judges) is captured — they only
+        # invoke their model during evaluate() at step 4.
         execution_usage: Optional[Dict[str, Any]] = None
         try:
             execution_configs = self.collect_all_configs()
             execution_traces = self.collect_all_traces()
-            execution_usage = self.collect_all_usage()
             # Store in context for potential timeout errors
             context.set_collected_traces(execution_traces)
         except Exception as e:
@@ -1272,11 +1274,6 @@ def _execute_task_repetition(
                 "error": f"Failed to collect traces: {e}",
                 "error_type": type(e).__name__,
             }
-            if execution_usage is None:
-                execution_usage = {
-                    "error": f"Failed to collect usage: {e}",
-                    "error_type": type(e).__name__,
-                }
 
         # 4. Evaluate (skip if task execution failed)
         if execution_status == TaskExecutionStatus.SUCCESS:
@@ -1311,6 +1308,18 @@ def _execute_task_repetition(
             # Task execution failed, so skip evaluation
             eval_results = None
 
+        # 4b. Collect usage (after evaluate() so judge/evaluator-owned
+        # model token usage is included). This is the only point that
+        # folds per-component usage into Benchmark.usage and
+        # Benchmark.usage_by_component (see ComponentRegistry.collect_usage).
+        try:
+            execution_usage = self.collect_all_usage()
+        except Exception as e:
+            execution_usage = {
+                "error": f"Failed to collect usage: {e}",
+                "error_type": type(e).__name__,
+            }
+
         # 5. Build report — all keys always present for consistent schema
         report = self._build_report(
             task,

From 18d96459fb437cadfa29771ae0b00463f7f17aff Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 17:51:32 +0200
Subject: [PATCH 4/8] fix(core): tighten step-comment cleanup per code review
 (#60)

Address code review on da99acd: shorten the verbose step-3 and step-4b
header comments (now single lines describing only the load-bearing
"why"), and renumber the new usage-collection block from "4b" to
"5", shifting build-report to "6", so the step sequence is uniform.

No behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 maseval/core/benchmark.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 1fb04e10..619bd139 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -1254,10 +1254,7 @@ def _execute_task_repetition(
 
             final_answers = None
 
-        # 3. Collect traces and configs (always attempt this).
-        # Usage is collected later (step 4b) so that token usage from
-        # evaluator-owned models (LLM judges) is captured — they only
-        # invoke their model during evaluate() at step 4.
+        # 3. Collect traces and configs (always attempt this)
         execution_usage: Optional[Dict[str, Any]] = None
         try:
             execution_configs = self.collect_all_configs()
@@ -1308,10 +1305,7 @@ def _execute_task_repetition(
             # Task execution failed, so skip evaluation
             eval_results = None
 
-        # 4b. Collect usage (after evaluate() so judge/evaluator-owned
-        # model token usage is included). This is the only point that
-        # folds per-component usage into Benchmark.usage and
-        # Benchmark.usage_by_component (see ComponentRegistry.collect_usage).
+        # 5. Collect usage after evaluate() so judge/evaluator-owned model tokens are captured.
         try:
             execution_usage = self.collect_all_usage()
         except Exception as e:
@@ -1320,7 +1314,7 @@ def _execute_task_repetition(
                 "error_type": type(e).__name__,
             }
 
-        # 5. Build report — all keys always present for consistent schema
+        # 6. Build report — all keys always present for consistent schema
         report = self._build_report(
             task,
             repeat_idx,

From a928c54f5a720408411855aacfa927907188488d Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 17:53:22 +0200
Subject: [PATCH 5/8] test: assert agent usage survives evaluator and execution
 failures (#60)

Locks in that the post-evaluate usage collection (step 5) still runs
on the eval-failure and execution-failure paths, so pre-failure
agent-side usage lands in the report rather than the usage field
becoming an error dict.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../test_benchmark/test_usage_collection.py   | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py
index 69052acf..c0a07797 100644
--- a/tests/test_core/test_benchmark/test_usage_collection.py
+++ b/tests/test_core/test_benchmark/test_usage_collection.py
@@ -178,3 +178,74 @@ def test_judge_model_usage_aggregated_in_benchmark_total(self):
         assert "models:judge_model" in by_component, f"keys: {list(by_component)}"
         assert by_component["models:judge_model"].input_tokens == 100
         assert by_component["models:judge_model"].output_tokens == 50
+
+    def test_agent_usage_captured_when_evaluation_raises(self):
+        """When evaluate() raises and fail_on_evaluation_error=False, the
+        report still carries a real usage dict (step 5 runs after step 4)."""
+
+        class RaisingEvaluator:
+            def __init__(self, task, environment, user):
+                self.task = task
+
+            def filter_traces(self, traces):
+                return traces
+
+            def __call__(self, traces, final_answer=None):
+                raise RuntimeError("boom — simulated evaluator failure")
+
+        class RaisingJudgeBenchmark(DummyBenchmark):
+            def setup_evaluators(self, environment, task, agents, user, seed_generator):
+                # Register an agent-side model with usage so we can assert
+                # that pre-evaluate usage still survives the eval failure.
+                agent_model = DummyModelAdapter(
+                    model_id="agent_model",
+                    usage={"input_tokens": 42, "output_tokens": 7, "total_tokens": 49},
+                )
+                self.register("models", "agent_model", agent_model)
+                # Drive the model once so it has a usage record.
+                agent_model.chat([{"role": "user", "content": "hi"}])
+                return [RaisingEvaluator(task, environment, user)]
+
+        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = RaisingJudgeBenchmark(fail_on_evaluation_error=False)
+
+        reports = benchmark.run(tasks, agent_data={"model": "test"})
+
+        report = reports[0]
+        assert report["status"] == "evaluation_failed"
+        usage = report["usage"]
+        assert isinstance(usage, dict)
+        assert "error" not in usage, f"usage became an error dict: {usage}"
+        assert usage["models"]["agent_model"]["input_tokens"] == 42
+
+    def test_agent_usage_captured_when_execution_raises(self):
+        """When run_agents raises (execution failure) and fail_on_task_error
+        is False, the report still carries a real usage dict. Evaluate is
+        skipped, but step 5 still runs."""
+        from maseval.core.exceptions import AgentError
+
+        class FailingAgentBenchmark(DummyBenchmark):
+            def setup_agents(self, agent_data, environment, task, user, seed_generator):
+                agent_model = DummyModelAdapter(
+                    model_id="agent_model",
+                    usage={"input_tokens": 11, "output_tokens": 3, "total_tokens": 14},
+                )
+                self.register("models", "agent_model", agent_model)
+                # Drive the model once so it has a usage record.
+                agent_model.chat([{"role": "user", "content": "hi"}])
+                return super().setup_agents(agent_data, environment, task, user, seed_generator)
+
+            def run_agents(self, agents, task, environment, query):
+                raise AgentError("simulated agent failure", component="agent")
+
+        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = FailingAgentBenchmark(fail_on_task_error=False)
+
+        reports = benchmark.run(tasks, agent_data={"model": "test"})
+
+        report = reports[0]
+        assert report["status"] == "agent_error"
+        usage = report["usage"]
+        assert isinstance(usage, dict)
+        assert "error" not in usage, f"usage became an error dict: {usage}"
+        assert usage["models"]["agent_model"]["input_tokens"] == 11

From 7341950112428d4de5ab81e913b745c46f8fe43f Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 18:07:48 +0200
Subject: [PATCH 6/8] test: tighten failure-path regression tests per code
 review (#60)

Address code review on a928c54: assert output_tokens (not just
input_tokens), drop the redundant isinstance(usage, dict) check,
mark RaisingEvaluator's unused constructor args explicit with _,
and move the AgentError import to module scope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_core/test_benchmark/test_usage_collection.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py
index c0a07797..36cf74fc 100644
--- a/tests/test_core/test_benchmark/test_usage_collection.py
+++ b/tests/test_core/test_benchmark/test_usage_collection.py
@@ -6,6 +6,7 @@
 
 import pytest
 from maseval import TaskQueue
+from maseval.core.exceptions import AgentError
 from conftest import DummyBenchmark, DummyModelAdapter
 
 
@@ -185,7 +186,7 @@ def test_agent_usage_captured_when_evaluation_raises(self):
 
         class RaisingEvaluator:
             def __init__(self, task, environment, user):
-                self.task = task
+                _ = task, environment, user
 
             def filter_traces(self, traces):
                 return traces
@@ -214,15 +215,14 @@ def setup_evaluators(self, environment, task, agents, user, seed_generator):
         report = reports[0]
         assert report["status"] == "evaluation_failed"
         usage = report["usage"]
-        assert isinstance(usage, dict)
         assert "error" not in usage, f"usage became an error dict: {usage}"
         assert usage["models"]["agent_model"]["input_tokens"] == 42
+        assert usage["models"]["agent_model"]["output_tokens"] == 7
 
     def test_agent_usage_captured_when_execution_raises(self):
         """When run_agents raises (execution failure) and fail_on_task_error
         is False, the report still carries a real usage dict. Evaluate is
         skipped, but step 5 still runs."""
-        from maseval.core.exceptions import AgentError
 
         class FailingAgentBenchmark(DummyBenchmark):
             def setup_agents(self, agent_data, environment, task, user, seed_generator):
@@ -246,6 +246,6 @@ def run_agents(self, agents, task, environment, query):
         report = reports[0]
         assert report["status"] == "agent_error"
         usage = report["usage"]
-        assert isinstance(usage, dict)
         assert "error" not in usage, f"usage became an error dict: {usage}"
         assert usage["models"]["agent_model"]["input_tokens"] == 11
+        assert usage["models"]["agent_model"]["output_tokens"] == 3

From cfb0eea4bb1f0b9ed272ac3db2576c21cffb98a4 Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 18:11:12 +0200
Subject: [PATCH 7/8] docs: changelog entry for judge usage fix (#60)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e333b1c9..8b1b226b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed MACS real-data tests passing `{"environment_data": task.environment_data}` instead of `task.environment_data` directly, which caused `setup_state` to silently receive an empty tools list. (PR: #58)
 - Benchmark reports from `Benchmark.run()` now have a consistent schema across every outcome. Setup failures, setup timeouts, and unexpected worker failures in parallel runs previously produced reports missing the `usage` and `task` keys (with empty `traces`/`config`). Every report now always includes `task_id`, `repeat_idx`, `status`, `error`, `traces`, `config`, `usage`, `eval`, and `task`, and `report["error"]` is always populated whenever `status` is not `SUCCESS`. (PR: #61)
 - `fail_on_setup_error`, `fail_on_task_error`, and `fail_on_evaluation_error` now abort a parallel `Benchmark.run()` the same way they abort a sequential run. Previously a parallel run swallowed the failure into a degraded report and kept going. (PR: #61)
+- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #PR_NUMBER_PLACEHOLDER)
 
 ### Removed
 

From a5fa7048da590a9cbd1c620cce00cce1b0092e77 Mon Sep 17 00:00:00 2001
From: cemde <42615086+cemde@users.noreply.github.com>
Date: Sat, 23 May 2026 18:12:52 +0200
Subject: [PATCH 8/8] docs: fill in PR number for changelog entry

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b1b226b..adad9eed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,7 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed MACS real-data tests passing `{"environment_data": task.environment_data}` instead of `task.environment_data` directly, which caused `setup_state` to silently receive an empty tools list. (PR: #58)
 - Benchmark reports from `Benchmark.run()` now have a consistent schema across every outcome. Setup failures, setup timeouts, and unexpected worker failures in parallel runs previously produced reports missing the `usage` and `task` keys (with empty `traces`/`config`). Every report now always includes `task_id`, `repeat_idx`, `status`, `error`, `traces`, `config`, `usage`, `eval`, and `task`, and `report["error"]` is always populated whenever `status` is not `SUCCESS`. (PR: #61)
 - `fail_on_setup_error`, `fail_on_task_error`, and `fail_on_evaluation_error` now abort a parallel `Benchmark.run()` the same way they abort a sequential run. Previously a parallel run swallowed the failure into a degraded report and kept going. (PR: #61)
-- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #PR_NUMBER_PLACEHOLDER)
+- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #63)
 
 ### Removed