From ad115b98d8dab72ce560cbf3d0af4fa167cffea3 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 17:19:29 +0200 Subject: [PATCH 1/8] test: add failing repro for judge usage dropped from reports (#60) Tests assert that a judge model registered in setup_evaluators and invoked during evaluate() has non-zero tokens in both report["usage"] and Benchmark.usage. Both tests fail on current main because the usage snapshot is taken before evaluate() runs. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_benchmark/test_usage_collection.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py index ac42a92a..eb916904 100644 --- a/tests/test_core/test_benchmark/test_usage_collection.py +++ b/tests/test_core/test_benchmark/test_usage_collection.py @@ -96,3 +96,91 @@ def test_usage_property_returns_total(self): total = benchmark.usage assert total is not None # cost may be None if DummyModelAdapter doesn't provide usage + + +# --------------------------------------------------------------------------- +# Bug #60 reproduction: evaluator (judge) usage was dropped from reports +# because collect_all_usage() ran before evaluate(). The fixtures below +# build a benchmark whose evaluator owns a model adapter and only invokes +# it during __call__, mirroring real LLM judges. +# --------------------------------------------------------------------------- + + +class _JudgeEvaluator: + """Evaluator that invokes a model adapter during __call__. + + Defined as a duck-typed Evaluator (matches the abstract interface) so it + can hold a model reference and exercise it at evaluate-time. + """ + + def __init__(self, task, environment, user, model): + self.task = task + self.environment = environment + self.user = user + self.model = model + + def filter_traces(self, traces): + return traces + + def __call__(self, traces, final_answer=None): + # Invoke the judge model — this is the action whose usage was + # previously lost because collect_all_usage() had already run. + self.model.chat([{"role": "user", "content": "judge this"}]) + return {"score": 1.0, "passed": True} + + +def _make_judge_benchmark(judge_usage): + """Build a JudgeBenchmark whose setup_evaluators registers a judge model. + + The judge model is created with the provided per-call usage dict. Each + call to the model appends one usage record, so a single evaluator + invocation produces exactly one record's worth of tokens. + """ + from conftest import DummyBenchmark, DummyModelAdapter + + class JudgeBenchmark(DummyBenchmark): + def setup_evaluators(self, environment, task, agents, user, seed_generator): + judge_model = DummyModelAdapter(model_id="judge", usage=judge_usage) + self.register("models", "judge_model", judge_model) + return [_JudgeEvaluator(task, environment, user, model=judge_model)] + + return JudgeBenchmark() + + +@pytest.mark.core +class TestBenchmarkJudgeUsage: + """Regression tests for issue #60: judge token usage must appear in + per-task reports and aggregate into ``benchmark.usage``.""" + + def test_judge_model_usage_captured_in_report(self): + """A judge model invoked during evaluate() has non-zero usage in + report['usage']['models']['judge_model'].""" + judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}]) + benchmark = _make_judge_benchmark(judge_usage) + + reports = benchmark.run(tasks, agent_data={"model": "test"}) + + models = reports[0]["usage"]["models"] + assert "judge_model" in models, f"judge_model not registered; got: {list(models)}" + judge_entry = models["judge_model"] + assert judge_entry["input_tokens"] == 100 + assert judge_entry["output_tokens"] == 50 + + def test_judge_model_usage_aggregated_in_benchmark_total(self): + """``benchmark.usage`` includes judge tokens, and + ``benchmark.usage_by_component`` has a non-zero ``models:judge_model`` + entry.""" + judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}]) + benchmark = _make_judge_benchmark(judge_usage) + + benchmark.run(tasks, agent_data={"model": "test"}) + + assert benchmark.usage.input_tokens >= 100 + assert benchmark.usage.output_tokens >= 50 + + by_component = benchmark.usage_by_component + assert "models:judge_model" in by_component, f"keys: {list(by_component)}" + assert by_component["models:judge_model"].input_tokens == 100 + assert by_component["models:judge_model"].output_tokens == 50 From bc02d473595f3bba43a9c9734a994ac6c8d2e7e8 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 17:33:32 +0200 Subject: [PATCH 2/8] test: tighten judge-usage repro per code review (#60) Address code review on ad115b9: remove implementation-narrative comments, drop unused state from _JudgeEvaluator, move conftest import to module scope, condense docstrings, assert total_tokens. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_benchmark/test_usage_collection.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py index eb916904..69052acf 100644 --- a/tests/test_core/test_benchmark/test_usage_collection.py +++ b/tests/test_core/test_benchmark/test_usage_collection.py @@ -6,6 +6,7 @@ import pytest from maseval import TaskQueue +from conftest import DummyBenchmark, DummyModelAdapter @pytest.mark.core @@ -99,32 +100,26 @@ def test_usage_property_returns_total(self): # --------------------------------------------------------------------------- -# Bug #60 reproduction: evaluator (judge) usage was dropped from reports -# because collect_all_usage() ran before evaluate(). The fixtures below -# build a benchmark whose evaluator owns a model adapter and only invokes -# it during __call__, mirroring real LLM judges. +# Regression tests for issue #60: judge (evaluator) token usage collection # --------------------------------------------------------------------------- class _JudgeEvaluator: - """Evaluator that invokes a model adapter during __call__. + """Minimal evaluator that invokes a model adapter at evaluate-time. - Defined as a duck-typed Evaluator (matches the abstract interface) so it - can hold a model reference and exercise it at evaluate-time. + Not an ``Evaluator`` subclass — it implements the duck-typed + interface (``filter_traces`` + ``__call__``) that ``DummyBenchmark.evaluate`` + iterates over (see ``tests/conftest.py``). """ - def __init__(self, task, environment, user, model): - self.task = task - self.environment = environment - self.user = user + def __init__(self, model): self.model = model def filter_traces(self, traces): return traces def __call__(self, traces, final_answer=None): - # Invoke the judge model — this is the action whose usage was - # previously lost because collect_all_usage() had already run. + # Invokes the judge at evaluate-time (not at setup). self.model.chat([{"role": "user", "content": "judge this"}]) return {"score": 1.0, "passed": True} @@ -136,21 +131,19 @@ def _make_judge_benchmark(judge_usage): call to the model appends one usage record, so a single evaluator invocation produces exactly one record's worth of tokens. """ - from conftest import DummyBenchmark, DummyModelAdapter class JudgeBenchmark(DummyBenchmark): def setup_evaluators(self, environment, task, agents, user, seed_generator): judge_model = DummyModelAdapter(model_id="judge", usage=judge_usage) self.register("models", "judge_model", judge_model) - return [_JudgeEvaluator(task, environment, user, model=judge_model)] + return [_JudgeEvaluator(model=judge_model)] return JudgeBenchmark() @pytest.mark.core class TestBenchmarkJudgeUsage: - """Regression tests for issue #60: judge token usage must appear in - per-task reports and aggregate into ``benchmark.usage``.""" + """Tests that judge token usage reaches both per-task reports and ``benchmark.usage``.""" def test_judge_model_usage_captured_in_report(self): """A judge model invoked during evaluate() has non-zero usage in @@ -166,6 +159,7 @@ def test_judge_model_usage_captured_in_report(self): judge_entry = models["judge_model"] assert judge_entry["input_tokens"] == 100 assert judge_entry["output_tokens"] == 50 + assert judge_entry["total_tokens"] == 150 def test_judge_model_usage_aggregated_in_benchmark_total(self): """``benchmark.usage`` includes judge tokens, and From da99acd287da8e75ee8e08fa980eda34731ae59f Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 17:36:31 +0200 Subject: [PATCH 3/8] fix(core): collect usage after evaluate() so judge tokens are captured (#60) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Benchmark._execute_task_repetition the usage snapshot was taken in step 3 — before evaluate() ran. Models registered in setup_evaluators (LLM judges) were in the registry but had empty _usage_records at that point, so their tokens showed up as zero in report["usage"] and were never folded into Benchmark.usage / usage_by_component. Move collect_all_usage() into its own step 4b that runs after evaluate(). Configs and traces remain in step 3 because evaluate() consumes traces. The change is a pure lifecycle reordering inside one worker thread; no new shared state, no new locks, no API/schema change. Co-Authored-By: Claude Opus 4.7 (1M context) --- maseval/core/benchmark.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py index 68ce11fd..1fb04e10 100644 --- a/maseval/core/benchmark.py +++ b/maseval/core/benchmark.py @@ -1254,12 +1254,14 @@ def _execute_task_repetition( final_answers = None - # 3. Collect traces, configs, and usage (always attempt this) + # 3. Collect traces and configs (always attempt this). + # Usage is collected later (step 4b) so that token usage from + # evaluator-owned models (LLM judges) is captured — they only + # invoke their model during evaluate() at step 4. execution_usage: Optional[Dict[str, Any]] = None try: execution_configs = self.collect_all_configs() execution_traces = self.collect_all_traces() - execution_usage = self.collect_all_usage() # Store in context for potential timeout errors context.set_collected_traces(execution_traces) except Exception as e: @@ -1272,11 +1274,6 @@ def _execute_task_repetition( "error": f"Failed to collect traces: {e}", "error_type": type(e).__name__, } - if execution_usage is None: - execution_usage = { - "error": f"Failed to collect usage: {e}", - "error_type": type(e).__name__, - } # 4. Evaluate (skip if task execution failed) if execution_status == TaskExecutionStatus.SUCCESS: @@ -1311,6 +1308,18 @@ def _execute_task_repetition( # Task execution failed, so skip evaluation eval_results = None + # 4b. Collect usage (after evaluate() so judge/evaluator-owned + # model token usage is included). This is the only point that + # folds per-component usage into Benchmark.usage and + # Benchmark.usage_by_component (see ComponentRegistry.collect_usage). + try: + execution_usage = self.collect_all_usage() + except Exception as e: + execution_usage = { + "error": f"Failed to collect usage: {e}", + "error_type": type(e).__name__, + } + # 5. Build report — all keys always present for consistent schema report = self._build_report( task, From 18d96459fb437cadfa29771ae0b00463f7f17aff Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 17:51:32 +0200 Subject: [PATCH 4/8] fix(core): tighten step-comment cleanup per code review (#60) Address code review on da99acd: shorten verbose step-3 and step-5 header comments (now single lines describing only the load-bearing "why"), and renumber the new usage-collection block from "4b" to "5", shifting build-report to "6", so the step sequence is uniform. No behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) --- maseval/core/benchmark.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py index 1fb04e10..619bd139 100644 --- a/maseval/core/benchmark.py +++ b/maseval/core/benchmark.py @@ -1254,10 +1254,7 @@ def _execute_task_repetition( final_answers = None - # 3. Collect traces and configs (always attempt this). - # Usage is collected later (step 4b) so that token usage from - # evaluator-owned models (LLM judges) is captured — they only - # invoke their model during evaluate() at step 4. + # 3. Collect traces and configs (always attempt this) execution_usage: Optional[Dict[str, Any]] = None try: execution_configs = self.collect_all_configs() @@ -1308,10 +1305,7 @@ def _execute_task_repetition( # Task execution failed, so skip evaluation eval_results = None - # 4b. Collect usage (after evaluate() so judge/evaluator-owned - # model token usage is included). This is the only point that - # folds per-component usage into Benchmark.usage and - # Benchmark.usage_by_component (see ComponentRegistry.collect_usage). + # 5. Collect usage after evaluate() so judge/evaluator-owned model tokens are captured. try: execution_usage = self.collect_all_usage() except Exception as e: @@ -1320,7 +1314,7 @@ def _execute_task_repetition( "error_type": type(e).__name__, } - # 5. Build report — all keys always present for consistent schema + # 6. Build report — all keys always present for consistent schema report = self._build_report( task, repeat_idx, From a928c54f5a720408411855aacfa927907188488d Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 17:53:22 +0200 Subject: [PATCH 5/8] test: assert agent usage survives evaluator and execution failures (#60) Locks in that the post-evaluate usage collection (step 5) still runs on the eval-failure and execution-failure paths, so pre-failure agent-side usage lands in the report rather than the usage field becoming an error dict. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_benchmark/test_usage_collection.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py index 69052acf..c0a07797 100644 --- a/tests/test_core/test_benchmark/test_usage_collection.py +++ b/tests/test_core/test_benchmark/test_usage_collection.py @@ -178,3 +178,74 @@ def test_judge_model_usage_aggregated_in_benchmark_total(self): assert "models:judge_model" in by_component, f"keys: {list(by_component)}" assert by_component["models:judge_model"].input_tokens == 100 assert by_component["models:judge_model"].output_tokens == 50 + + def test_agent_usage_captured_when_evaluation_raises(self): + """When evaluate() raises and fail_on_evaluation_error=False, the + report still carries a real usage dict (step 5 runs after step 4).""" + + class RaisingEvaluator: + def __init__(self, task, environment, user): + self.task = task + + def filter_traces(self, traces): + return traces + + def __call__(self, traces, final_answer=None): + raise RuntimeError("boom — simulated evaluator failure") + + class RaisingJudgeBenchmark(DummyBenchmark): + def setup_evaluators(self, environment, task, agents, user, seed_generator): + # Register an agent-side model with usage so we can assert + # that pre-evaluate usage still survives the eval failure. + agent_model = DummyModelAdapter( + model_id="agent_model", + usage={"input_tokens": 42, "output_tokens": 7, "total_tokens": 49}, + ) + self.register("models", "agent_model", agent_model) + # Drive the model once so it has a usage record. + agent_model.chat([{"role": "user", "content": "hi"}]) + return [RaisingEvaluator(task, environment, user)] + + tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}]) + benchmark = RaisingJudgeBenchmark(fail_on_evaluation_error=False) + + reports = benchmark.run(tasks, agent_data={"model": "test"}) + + report = reports[0] + assert report["status"] == "evaluation_failed" + usage = report["usage"] + assert isinstance(usage, dict) + assert "error" not in usage, f"usage became an error dict: {usage}" + assert usage["models"]["agent_model"]["input_tokens"] == 42 + + def test_agent_usage_captured_when_execution_raises(self): + """When run_agents raises (execution failure) and fail_on_task_error + is False, the report still carries a real usage dict. Evaluate is + skipped, but step 5 still runs.""" + from maseval.core.exceptions import AgentError + + class FailingAgentBenchmark(DummyBenchmark): + def setup_agents(self, agent_data, environment, task, user, seed_generator): + agent_model = DummyModelAdapter( + model_id="agent_model", + usage={"input_tokens": 11, "output_tokens": 3, "total_tokens": 14}, + ) + self.register("models", "agent_model", agent_model) + # Drive the model once so it has a usage record. + agent_model.chat([{"role": "user", "content": "hi"}]) + return super().setup_agents(agent_data, environment, task, user, seed_generator) + + def run_agents(self, agents, task, environment, query): + raise AgentError("simulated agent failure", component="agent") + + tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}]) + benchmark = FailingAgentBenchmark(fail_on_task_error=False) + + reports = benchmark.run(tasks, agent_data={"model": "test"}) + + report = reports[0] + assert report["status"] == "agent_error" + usage = report["usage"] + assert isinstance(usage, dict) + assert "error" not in usage, f"usage became an error dict: {usage}" + assert usage["models"]["agent_model"]["input_tokens"] == 11 From 7341950112428d4de5ab81e913b745c46f8fe43f Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 18:07:48 +0200 Subject: [PATCH 6/8] test: tighten failure-path regression tests per code review (#60) Address code review on a928c54: assert output_tokens (not just input_tokens), drop the redundant isinstance(usage, dict) check, mark RaisingEvaluator's unused constructor args explicit with _, and move the AgentError import to module scope. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_core/test_benchmark/test_usage_collection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core/test_benchmark/test_usage_collection.py b/tests/test_core/test_benchmark/test_usage_collection.py index c0a07797..36cf74fc 100644 --- a/tests/test_core/test_benchmark/test_usage_collection.py +++ b/tests/test_core/test_benchmark/test_usage_collection.py @@ -6,6 +6,7 @@ import pytest from maseval import TaskQueue +from maseval.core.exceptions import AgentError from conftest import DummyBenchmark, DummyModelAdapter @@ -185,7 +186,7 @@ def test_agent_usage_captured_when_evaluation_raises(self): class RaisingEvaluator: def __init__(self, task, environment, user): - self.task = task + _ = task, environment, user def filter_traces(self, traces): return traces @@ -214,15 +215,14 @@ def setup_evaluators(self, environment, task, agents, user, seed_generator): report = reports[0] assert report["status"] == "evaluation_failed" usage = report["usage"] - assert isinstance(usage, dict) assert "error" not in usage, f"usage became an error dict: {usage}" assert usage["models"]["agent_model"]["input_tokens"] == 42 + assert usage["models"]["agent_model"]["output_tokens"] == 7 def test_agent_usage_captured_when_execution_raises(self): """When run_agents raises (execution failure) and fail_on_task_error is False, the report still carries a real usage dict. Evaluate is skipped, but step 5 still runs.""" - from maseval.core.exceptions import AgentError class FailingAgentBenchmark(DummyBenchmark): def setup_agents(self, agent_data, environment, task, user, seed_generator): @@ -246,6 +246,6 @@ def run_agents(self, agents, task, environment, query): report = reports[0] assert report["status"] == "agent_error" usage = report["usage"] - assert isinstance(usage, dict) assert "error" not in usage, f"usage became an error dict: {usage}" assert usage["models"]["agent_model"]["input_tokens"] == 11 + assert usage["models"]["agent_model"]["output_tokens"] == 3 From cfb0eea4bb1f0b9ed272ac3db2576c21cffb98a4 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 18:11:12 +0200 Subject: [PATCH 7/8] docs: changelog entry for judge usage fix (#60) Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e333b1c9..8b1b226b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed MACS real-data tests passing `{"environment_data": task.environment_data}` instead of `task.environment_data` directly, which caused `setup_state` to silently receive an empty tools list. (PR: #58) - Benchmark reports from `Benchmark.run()` now have a consistent schema across every outcome. Setup failures, setup timeouts, and unexpected worker failures in parallel runs previously produced reports missing the `usage` and `task` keys (with empty `traces`/`config`). Every report now always includes `task_id`, `repeat_idx`, `status`, `error`, `traces`, `config`, `usage`, `eval`, and `task`, and `report["error"]` is always populated whenever `status` is not `SUCCESS`. (PR: #61) - `fail_on_setup_error`, `fail_on_task_error`, and `fail_on_evaluation_error` now abort a parallel `Benchmark.run()` the same way they abort a sequential run. Previously a parallel run swallowed the failure into a degraded report and kept going. (PR: #61) +- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #PR_NUMBER_PLACEHOLDER) ### Removed From a5fa7048da590a9cbd1c620cce00cce1b0092e77 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sat, 23 May 2026 18:12:52 +0200 Subject: [PATCH 8/8] docs: fill in PR number for changelog entry Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b1b226b..adad9eed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,7 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed MACS real-data tests passing `{"environment_data": task.environment_data}` instead of `task.environment_data` directly, which caused `setup_state` to silently receive an empty tools list. (PR: #58) - Benchmark reports from `Benchmark.run()` now have a consistent schema across every outcome. Setup failures, setup timeouts, and unexpected worker failures in parallel runs previously produced reports missing the `usage` and `task` keys (with empty `traces`/`config`). Every report now always includes `task_id`, `repeat_idx`, `status`, `error`, `traces`, `config`, `usage`, `eval`, and `task`, and `report["error"]` is always populated whenever `status` is not `SUCCESS`. (PR: #61) - `fail_on_setup_error`, `fail_on_task_error`, and `fail_on_evaluation_error` now abort a parallel `Benchmark.run()` the same way they abort a sequential run. Previously a parallel run swallowed the failure into a degraded report and kept going. (PR: #61) -- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #PR_NUMBER_PLACEHOLDER) +- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #63) ### Removed