From 501b730e81e97c519e7140963cd7b2b9db4f1ab1 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 16:55:26 -0800
Subject: [PATCH 01/46] test: Capture baseline metric routing behavior

---
 tests/unit/test_metric_routing.py | 66 +++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 tests/unit/test_metric_routing.py

diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
new file mode 100644
index 00000000..db297e22
--- /dev/null
+++ b/tests/unit/test_metric_routing.py
@@ -0,0 +1,66 @@
+import json
+import os
+from pathlib import Path
+import types
+from unittest.mock import MagicMock, patch
+
+from art import Model
+
+
+class TestMetricRoutingBaseline:
+    def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None:
+        model = Model(
+            name="test-model",
+            project="test-project",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        model._log_metrics(
+            {
+                "reward/mean": 0.9,
+                "custom": 1.0,
+            },
+            split="train",
+            step=7,
+        )
+
+        history_path = tmp_path / "test-project/models/test-model/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["train/reward/mean"] == 0.9
+        assert entry["train/custom"] == 1.0
+        assert "reward/mean" not in entry
+        assert "training_step" not in entry
+        assert "time/wall_clock_sec" not in entry
+
+    def test_get_wandb_run_registers_existing_sections(self, tmp_path: Path) -> None:
+        fake_run = MagicMock()
+        fake_run._is_finished = False
+
+        fake_wandb = types.SimpleNamespace()
+        fake_wandb.init = MagicMock(return_value=fake_run)
+        fake_wandb.define_metric = MagicMock()
+        fake_wandb.Settings = lambda **kwargs: kwargs
+
+        with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False):
+            with patch.dict("sys.modules", {"wandb": fake_wandb}):
+                model = Model(
+                    name="test-model",
+                    project="test-project",
+                    base_path=str(tmp_path),
+                )
+                run = model._get_wandb_run()
+
+        assert run is fake_run
+        define_calls = [
+            (call.args, call.kwargs)
+            for call in fake_wandb.define_metric.call_args_list
+        ]
+        assert define_calls == [
+            (("training_step",), {}),
+            (("train/*",), {"step_metric": "training_step"}),
+            (("val/*",), {"step_metric": "training_step"}),
+            (("costs/*",), {"step_metric": "training_step"}),
+        ]

From 6d4f68938031e9eac2d16e6df4008f1e2be0fb20 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 16:57:17 -0800
Subject: [PATCH 02/46] feat: Add section-aware metric routing and W&B taxonomy
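 registration

When split is train, val or test, a metric key that contains "/" and
whose first component is a known section (reward, loss, offpolicy,
pipeline, throughput, costs, time, data) or a split name is logged
under its own name; every other key is still prefixed with "<split>/".
Any other split value keeps the old behavior of prefixing every key.

Each logged entry now also carries training_step and
time/wall_clock_sec (seconds since the Model was constructed), in
history.jsonl as well as in W&B.

W&B metric definitions are registered for reward, loss, throughput,
costs, time, data, train, val and test. offpolicy/* and pipeline/* are
routed as sections but are not registered here.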
---
 src/art/model.py                    | 46 +++++++++++++++++++++++++++--
 tests/unit/test_frontend_logging.py | 14 ++++++++-
 tests/unit/test_metric_routing.py   | 24 ++++++++++-----
 3 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/src/art/model.py b/src/art/model.py
index a5b13582..7d5e7df2 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 import json
 import os
+import time
 from typing import TYPE_CHECKING, Any, Generic, Iterable, Optional, cast, overload
 import warnings
 
@@ -29,6 +30,19 @@
 COSTS_STATE_KEY = "_costs"
 COSTS_METRIC_PREFIX = "costs_"
 COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
+METRIC_SECTIONS = frozenset(
+    {
+        "reward",
+        "loss",
+        "offpolicy",
+        "pipeline",
+        "throughput",
+        "costs",
+        "time",
+        "data",
+    }
+)
+METRIC_SPLITS = frozenset({"train", "val", "test"})
 
 
 class Model(
@@ -93,6 +107,7 @@ class Model(
     _s3_prefix: str | None = None
     _openai_client: AsyncOpenAI | None = None
     _wandb_run: Optional["Run"] = None  # Private, for lazy wandb initialization
+    _run_start_time: float
     _costs_lock: asyncio.Lock
     _cost_calculator: CostCalculator
 
@@ -123,6 +138,7 @@ def __init__(
             report_metrics=report_metrics,
             **kwargs,
         )
+        object.__setattr__(self, "_run_start_time", time.time())
 
     @overload
     def __new__(
@@ -380,9 +396,16 @@ def _get_wandb_run(self) -> Optional["Run"]:
             # Define training_step as the x-axis for all metrics.
             # This allows out-of-order logging (e.g., async validation for previous steps).
             wandb.define_metric("training_step")
+            wandb.define_metric("time/wall_clock_sec")
+            wandb.define_metric("reward/*", step_metric="training_step")
+            wandb.define_metric("loss/*", step_metric="training_step")
+            wandb.define_metric("throughput/*", step_metric="training_step")
+            wandb.define_metric("costs/*", step_metric="training_step")
+            wandb.define_metric("time/*", step_metric="training_step")
+            wandb.define_metric("data/*", step_metric="training_step")
             wandb.define_metric("train/*", step_metric="training_step")
             wandb.define_metric("val/*", step_metric="training_step")
-            wandb.define_metric("costs/*", step_metric="training_step")
+            wandb.define_metric("test/*", step_metric="training_step")
         return self._wandb_run
 
     def _log_metrics(
@@ -392,7 +415,24 @@ def _log_metrics(
         step: int,
     ) -> None:
         """Log metrics to history.jsonl and optionally wandb."""
-        prefixed = {f"{split}/{k}": v for k, v in metrics.items()}
+        if split in METRIC_SPLITS:
+            prefixed = {}
+            for key, value in metrics.items():
+                first_component = key.split("/", 1)[0]
+                has_prefix_component = "/" in key
+                if has_prefix_component and (
+                    first_component in METRIC_SECTIONS
+                    or first_component in METRIC_SPLITS
+                ):
+                    prefixed[key] = value
+                else:
+                    prefixed[f"{split}/{key}"] = value
+        else:
+            prefixed = {f"{split}/{k}": v for k, v in metrics.items()}
+
+        prefixed["training_step"] = step
+        prefixed["time/wall_clock_sec"] = time.time() - self._run_start_time
+
         output_dir = self._get_output_dir()
 
         # Ensure output directory exists
@@ -416,7 +456,7 @@ def _log_metrics(
         ) or (self.report_metrics is not None and "wandb" in self.report_metrics)
         if should_log_wandb:
             if run := self._get_wandb_run():
-                run.log({"training_step": step, **prefixed})
+                run.log(prefixed)
 
     async def _record_costs(
         self,
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 20278589..ed5ed4a0 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -337,10 +337,22 @@ async def test_metric_prefixes(self, tmp_path: Path):
             entry = json.loads(f.readline())
 
         # All metrics should be prefixed (except step and recorded_at)
-        metric_keys = [k for k in entry.keys() if k not in ["step", "recorded_at"]]
+        metric_keys = [
+            k
+            for k in entry.keys()
+            if k
+            not in [
+                "step",
+                "recorded_at",
+                "training_step",
+                "time/wall_clock_sec",
+            ]
+        ]
         assert all(k.startswith("val/") for k in metric_keys), (
             f"Not all metrics prefixed: {metric_keys}"
         )
+        assert entry["training_step"] == 0
+        assert entry["time/wall_clock_sec"] >= 0
 
     @pytest.mark.asyncio
     async def test_standard_metrics_present(self, tmp_path: Path):
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
index db297e22..e83a48ed 100644
--- a/tests/unit/test_metric_routing.py
+++ b/tests/unit/test_metric_routing.py
@@ -8,7 +8,9 @@
 
 
 class TestMetricRoutingBaseline:
-    def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None:
+    def test_log_metrics_routes_known_sections_without_split_prefix(
+        self, tmp_path: Path
+    ) -> None:
         model = Model(
             name="test-model",
             project="test-project",
@@ -20,6 +22,7 @@ def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None:
             {
                 "reward/mean": 0.9,
                 "custom": 1.0,
+                "rewardish/value": 2.0,
             },
             split="train",
             step=7,
@@ -29,13 +32,13 @@ def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None:
         with open(history_path) as f:
             entry = json.loads(f.readline())
 
-        assert entry["train/reward/mean"] == 0.9
+        assert entry["reward/mean"] == 0.9
         assert entry["train/custom"] == 1.0
-        assert "reward/mean" not in entry
-        assert "training_step" not in entry
-        assert "time/wall_clock_sec" not in entry
+        assert entry["train/rewardish/value"] == 2.0
+        assert entry["training_step"] == 7
+        assert entry["time/wall_clock_sec"] >= 0
 
-    def test_get_wandb_run_registers_existing_sections(self, tmp_path: Path) -> None:
+    def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None:
         fake_run = MagicMock()
         fake_run._is_finished = False
 
@@ -60,7 +63,14 @@ def test_get_wandb_run_registers_existing_sections(self, tmp_path: Path) -> None
         ]
         assert define_calls == [
             (("training_step",), {}),
+            (("time/wall_clock_sec",), {}),
+            (("reward/*",), {"step_metric": "training_step"}),
+            (("loss/*",), {"step_metric": "training_step"}),
+            (("throughput/*",), {"step_metric": "training_step"}),
+            (("costs/*",), {"step_metric": "training_step"}),
+            (("time/*",), {"step_metric": "training_step"}),
+            (("data/*",), {"step_metric": "training_step"}),
             (("train/*",), {"step_metric": "training_step"}),
             (("val/*",), {"step_metric": "training_step"}),
-            (("costs/*",), {"step_metric": "training_step"}),
+            (("test/*",), {"step_metric": "training_step"}),
         ]

From 498a0cdc0e0ba2e4dc8b2c2b2c0e0b4374a0add3 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:15:02 -0800
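Subject: [PATCH 03/46] feat: Add MetricsBuilder with hierarchical cost rollups

MetricsBuilder buffers metrics for one step and returns them from
flush(), which then clears the buffer.

Costs are added as leaves under costs/<path>; repeated writes to the
same leaf are summed, a key may not be both a leaf and a parent, and
keys ending in "_cum" are rejected. flush() adds a rollup for every
intermediate parent plus a costs/all total.

Every costs/, time/ and data/ value emitted by flush() also gets a
running "<key>_cum" total. throughput/ idle times are emitted without
a cumulative counterpart. data/cum_num_unique_scenarios reports the
number of distinct scenario ids seen so far.

state_dict()/load_state_dict() round-trip the cumulative totals and
the scenario ids, and activate()/get_active() expose the current
builder through a ContextVar.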
---
 src/art/metrics.py                 | 174 +++++++++++++++++++++++++++++
 tests/unit/test_metrics_builder.py | 161 ++++++++++++++++++++++++++
 2 files changed, 335 insertions(+)
 create mode 100644 src/art/metrics.py
 create mode 100644 tests/unit/test_metrics_builder.py

diff --git a/src/art/metrics.py b/src/art/metrics.py
new file mode 100644
index 00000000..4ff89f00
--- /dev/null
+++ b/src/art/metrics.py
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+import asyncio
+from contextvars import ContextVar, Token
+from typing import Any
+
+_active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder")
+
+_HIERARCHICAL_SECTIONS = {"costs", "time", "data"}
+
+
+class MetricsBuilder:
+    """Build and accumulate step-level metrics for logging."""
+
+    def __init__(self, cost_context: str) -> None:
+        if not cost_context:
+            raise ValueError("cost_context must be non-empty")
+
+        self.cost_context = cost_context
+        self._lock = asyncio.Lock()
+        self._step_buffer: dict[str, float] = {}
+        self._cum_state: dict[str, float] = {}
+        self._unique_scenario_ids: set[str] = set()
+
+    def add_cost(self, path: str, usd: float) -> None:
+        if not path:
+            raise ValueError("Cost path must be non-empty")
+        full_key = f"costs/{path}"
+        self._validate_and_add(full_key, float(usd))
+
+    def add_data(
+        self,
+        step_num_scenarios: int | None = None,
+        step_actor_tokens: int | None = None,
+        scenario_ids: list[str] | None = None,
+    ) -> None:
+        if step_num_scenarios is not None:
+            self._step_buffer["data/step_num_scenarios"] = float(step_num_scenarios)
+        if step_actor_tokens is not None:
+            self._step_buffer["data/step_actor_tokens"] = float(step_actor_tokens)
+        if scenario_ids is not None:
+            self._unique_scenario_ids.update(scenario_ids)
+
+    def add_user_timing(
+        self,
+        step_wall_s: float | None = None,
+        step_actor_s: float | None = None,
+        step_eval_s: float | None = None,
+    ) -> None:
+        if step_wall_s is not None:
+            self._step_buffer["time/step_wall_s"] = float(step_wall_s)
+        if step_actor_s is not None:
+            self._step_buffer["time/step_actor_s"] = float(step_actor_s)
+        if step_eval_s is not None:
+            self._step_buffer["time/step_eval_s"] = float(step_eval_s)
+
+    def add_idle_times(
+        self,
+        step_trainer_idle_s: float | None = None,
+        step_actor_idle_s: float | None = None,
+    ) -> None:
+        if step_trainer_idle_s is not None:
+            self._step_buffer["throughput/step_trainer_idle_s"] = float(
+                step_trainer_idle_s
+            )
+        if step_actor_idle_s is not None:
+            self._step_buffer["throughput/step_actor_idle_s"] = float(step_actor_idle_s)
+
+    async def flush(self, step: int) -> dict[str, float]:
+        del step
+        async with self._lock:
+            self._validate_hierarchy()
+
+            result = dict(self._step_buffer)
+            cost_metrics = {
+                key: value
+                for key, value in self._step_buffer.items()
+                if key.startswith("costs/")
+            }
+            result.update(self._compute_rollups(cost_metrics))
+
+            for key, value in list(result.items()):
+                section = key.split("/", 1)[0]
+                if section not in _HIERARCHICAL_SECTIONS:
+                    continue
+                cum_key = f"{key}_cum"
+                next_value = self._cum_state.get(cum_key, 0.0) + value
+                self._cum_state[cum_key] = next_value
+                result[cum_key] = next_value
+
+            if self._unique_scenario_ids:
+                result["data/cum_num_unique_scenarios"] = float(
+                    len(self._unique_scenario_ids)
+                )
+
+            self._step_buffer.clear()
+            return result
+
+    def activate(self) -> Token["MetricsBuilder"]:
+        return _active_builder.set(self)
+
+    @staticmethod
+    def get_active() -> "MetricsBuilder":
+        return _active_builder.get()
+
+    def state_dict(self) -> dict[str, Any]:
+        return {
+            "cum_state": dict(self._cum_state),
+            "unique_scenario_ids": list(self._unique_scenario_ids),
+        }
+
+    def load_state_dict(self, state: dict[str, Any]) -> None:
+        raw_cum_state = state.get("cum_state", {})
+        raw_unique_ids = state.get("unique_scenario_ids", [])
+        self._cum_state = {str(k): float(v) for k, v in raw_cum_state.items()}
+        self._unique_scenario_ids = {str(v) for v in raw_unique_ids}
+
+    def _validate_and_add(self, key: str, value: float) -> None:
+        if key.endswith("_cum"):
+            raise ValueError(
+                f"Metric key '{key}' ends with '_cum', which is reserved for cumulative metrics."
+            )
+
+        for existing_key in self._step_buffer:
+            if existing_key == key:
+                continue
+            if existing_key.startswith(f"{key}/"):
+                raise ValueError(
+                    f"Cannot log '{key}' as a leaf: it is an ancestor of '{existing_key}'."
+                )
+            if key.startswith(f"{existing_key}/"):
+                raise ValueError(
+                    f"Cannot log '{key}' as a leaf: '{existing_key}' is already a leaf ancestor."
+                )
+
+        self._step_buffer[key] = self._step_buffer.get(key, 0.0) + value
+
+    def _validate_hierarchy(self) -> None:
+        keys = sorted(k for k in self._step_buffer if k.startswith("costs/"))
+        for i, key in enumerate(keys):
+            for other in keys[i + 1 :]:
+                if other.startswith(f"{key}/"):
+                    raise ValueError(
+                        f"Leaf/parent conflict: '{key}' and '{other}' cannot coexist."
+                    )
+
+    def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]:
+        if not cost_metrics:
+            return {}
+
+        all_parents: set[str] = set()
+        for key in cost_metrics:
+            parts = key.split("/")
+            for depth in range(2, len(parts)):
+                all_parents.add("/".join(parts[:depth]))
+
+        rollups: dict[str, float] = {}
+        for parent in all_parents:
+            prefix = f"{parent}/"
+            rollups[parent] = sum(
+                value for key, value in cost_metrics.items() if key.startswith(prefix)
+            )
+
+        top_level_children = {key.split("/")[1] for key in cost_metrics}
+        costs_all = 0.0
+        for child_name in top_level_children:
+            child_key = f"costs/{child_name}"
+            if child_key in rollups:
+                costs_all += rollups[child_key]
+            else:
+                costs_all += cost_metrics[child_key]
+        rollups["costs/all"] = costs_all
+
+        return rollups
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
new file mode 100644
index 00000000..7b56c150
--- /dev/null
+++ b/tests/unit/test_metrics_builder.py
@@ -0,0 +1,161 @@
+import asyncio
+
+import pytest
+
+from art.metrics import MetricsBuilder
+
+
+class TestMetricsBuilder:
+    @pytest.mark.asyncio
+    async def test_rollup_correctness_across_depths(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train/llm_judge/general_judge", usd=0.08)
+        builder.add_cost("train/llm_judge/hallucination_judge", usd=0.04)
+        builder.add_cost("train/tinker_train", usd=1.20)
+        builder.add_cost("train/tinker_inference", usd=0.45)
+        builder.add_cost("eval/llm_judge/correctness", usd=0.06)
+
+        metrics = await builder.flush(step=1)
+
+        assert metrics["costs/train/llm_judge"] == pytest.approx(0.12)
+        assert metrics["costs/train"] == pytest.approx(1.77)
+        assert metrics["costs/eval"] == pytest.approx(0.06)
+        assert metrics["costs/all"] == pytest.approx(1.83)
+        assert metrics["costs/train/llm_judge_cum"] == pytest.approx(0.12)
+        assert metrics["costs/train_cum"] == pytest.approx(1.77)
+        assert metrics["costs/all_cum"] == pytest.approx(1.83)
+
+    @pytest.mark.asyncio
+    async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3)
+        builder.add_data(
+            step_num_scenarios=2,
+            step_actor_tokens=10,
+            scenario_ids=["a", "b"],
+        )
+        first = await builder.flush(step=1)
+
+        assert first["time/step_wall_s_cum"] == pytest.approx(1.5)
+        assert first["time/step_actor_s_cum"] == pytest.approx(0.3)
+        assert first["data/step_num_scenarios_cum"] == pytest.approx(2)
+        assert first["data/step_actor_tokens_cum"] == pytest.approx(10)
+        assert first["data/cum_num_unique_scenarios"] == 2
+
+        builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2)
+        builder.add_data(
+            step_num_scenarios=3,
+            step_actor_tokens=5,
+            scenario_ids=["b", "c"],
+        )
+        second = await builder.flush(step=2)
+
+        assert second["time/step_wall_s_cum"] == pytest.approx(2.0)
+        assert second["time/step_actor_s_cum"] == pytest.approx(0.5)
+        assert second["data/step_num_scenarios_cum"] == pytest.approx(5)
+        assert second["data/step_actor_tokens_cum"] == pytest.approx(15)
+        assert second["data/cum_num_unique_scenarios"] == 3
+
+    @pytest.mark.asyncio
+    async def test_costs_all_generated_for_single_and_multiple_children(self) -> None:
+        single = MetricsBuilder(cost_context="train")
+        single.add_cost("train/gpu", usd=2.0)
+        one = await single.flush(step=1)
+        assert one["costs/all"] == pytest.approx(2.0)
+
+        multi = MetricsBuilder(cost_context="train")
+        multi.add_cost("train/gpu", usd=2.0)
+        multi.add_cost("eval/llm_judge/correctness", usd=0.5)
+        two = await multi.flush(step=1)
+        assert two["costs/all"] == pytest.approx(2.5)
+
+    def test_leaf_parent_conflicts_raise(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train", usd=1.0)
+        with pytest.raises(ValueError):
+            builder.add_cost("train/llm_judge", usd=0.1)
+
+        other = MetricsBuilder(cost_context="train")
+        other.add_cost("train/llm_judge", usd=0.1)
+        with pytest.raises(ValueError):
+            other.add_cost("train", usd=1.0)
+
+    @pytest.mark.asyncio
+    async def test_duplicate_leaf_writes_are_summed(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train/gpu", usd=1.25)
+        builder.add_cost("train/gpu", usd=0.75)
+
+        metrics = await builder.flush(step=1)
+
+        assert metrics["costs/train/gpu"] == pytest.approx(2.0)
+        assert metrics["costs/train"] == pytest.approx(2.0)
+        assert metrics["costs/all"] == pytest.approx(2.0)
+
+    def test_cum_suffix_is_reserved(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        with pytest.raises(ValueError):
+            builder.add_cost("train/llm_judge_cum", usd=0.1)
+
+    @pytest.mark.asyncio
+    async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train/gpu", usd=1.0)
+        first = await builder.flush(step=1)
+        assert first["costs/train_cum"] == pytest.approx(1.0)
+
+        second = await builder.flush(step=2)
+        assert not any(key.startswith("costs/") for key in second)
+
+        builder.add_cost("train/gpu", usd=2.0)
+        third = await builder.flush(step=3)
+        assert third["costs/train"] == pytest.approx(2.0)
+        assert third["costs/train_cum"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
+        before = MetricsBuilder(cost_context="train")
+        before.add_cost("train/gpu", usd=1.0)
+        await before.flush(step=1)
+
+        state = before.state_dict()
+        after = MetricsBuilder(cost_context="train")
+        after.load_state_dict(state)
+        after.add_cost("train/gpu", usd=2.0)
+
+        metrics = await after.flush(step=2)
+        assert metrics["costs/train_cum"] == pytest.approx(3.0)
+        assert metrics["costs/all_cum"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_data(scenario_ids=["s1", "s2", "s3"])
+        first = await builder.flush(step=1)
+        assert first["data/cum_num_unique_scenarios"] == 3
+
+        builder.add_data(scenario_ids=["s2", "s4"])
+        second = await builder.flush(step=2)
+        assert second["data/cum_num_unique_scenarios"] == 4
+
+    @pytest.mark.asyncio
+    async def test_concurrent_add_cost_calls_do_not_lose_updates(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        async def worker() -> None:
+            for _ in range(25):
+                builder.add_cost("train/gpu", usd=0.1)
+                await asyncio.sleep(0)
+
+        await asyncio.gather(*(worker() for _ in range(4)))
+        metrics = await builder.flush(step=1)
+
+        assert metrics["costs/train/gpu"] == pytest.approx(10.0)
+        assert metrics["costs/all"] == pytest.approx(10.0)
+
+    def test_contextvar_activate_and_get_active(self) -> None:
+        builder = MetricsBuilder(cost_context="eval")
+        token = builder.activate()
+        assert MetricsBuilder.get_active() is builder
+        token.var.reset(token)

From c4e848c5c579dc0a7cc3a0476af25ca21238b635 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:25:52 -0800
Subject: [PATCH 04/46] test: Capture baseline train trajectory metric routing

---
 tests/unit/test_frontend_logging.py | 36 +++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index ed5ed4a0..83251f7e 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -467,6 +467,42 @@ async def test_exception_rate_calculation(self, tmp_path: Path):
         # All successful trajectories = 0% exception rate
         assert entry["val/exception_rate"] == 0.0
 
+    @pytest.mark.asyncio
+    async def test_train_trajectory_metrics_default_to_train_prefix_baseline(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        trajectories = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.7,
+                        metrics={
+                            "custom_score": 1.0,
+                            "reward/prefixed": 2.0,
+                        },
+                        messages_and_choices=[{"role": "user", "content": "test"}],
+                    )
+                ],
+                exceptions=[],
+            )
+        ]
+
+        await model.log(trajectories, split="train")
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["train/custom_score"] == 1.0
+        assert entry["reward/prefixed"] == 2.0
+
 
 class TestWandbIntegration:
     """Test wandb integration logic (without mocking wandb itself)."""

From 1d9bf2616bc87f1d05544d5133b040a4d7ff799d Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:27:52 -0800
Subject: [PATCH 05/46] feat: Route train trajectory metrics and log costs via
 MetricsBuilder

---
 src/art/model.py                    | 189 ++++++++--------------------
 tests/unit/test_frontend_logging.py |  47 ++++++-
 2 files changed, 97 insertions(+), 139 deletions(-)

diff --git a/src/art/model.py b/src/art/model.py
index 7d5e7df2..18a1a9ac 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -14,6 +14,7 @@
 
 from . import dev
 from .costs import CostCalculator
+from .metrics import MetricsBuilder
 from .trajectories import Trajectory, TrajectoryGroup
 from .types import TrainConfig, TrainSFTConfig
 from .utils.trajectory_logging import write_trajectory_groups_parquet
@@ -27,7 +28,6 @@
 ModelConfig = TypeVar("ModelConfig", bound=BaseModel | None)
 StateType = TypeVar("StateType", bound=dict[str, Any], default=dict[str, Any])
 
-COSTS_STATE_KEY = "_costs"
 COSTS_METRIC_PREFIX = "costs_"
 COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
 METRIC_SECTIONS = frozenset(
@@ -108,7 +108,7 @@ class Model(
     _openai_client: AsyncOpenAI | None = None
     _wandb_run: Optional["Run"] = None  # Private, for lazy wandb initialization
     _run_start_time: float
-    _costs_lock: asyncio.Lock
+    _metrics_builder: MetricsBuilder
     _cost_calculator: CostCalculator
 
     def __init__(
@@ -139,6 +139,7 @@ def __init__(
             **kwargs,
         )
         object.__setattr__(self, "_run_start_time", time.time())
+        object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train"))
 
     @overload
     def __new__(
@@ -458,63 +459,30 @@ def _log_metrics(
             if run := self._get_wandb_run():
                 run.log(prefixed)
 
-    async def _record_costs(
-        self,
-        split: str,
-        step: int,
-        *,
-        cost_components: dict[str, float],
-        cost_total_direct: float,
-        cost_seen: bool,
-    ) -> None:
-        component_total = sum(cost_components.values())
-        step_total = component_total if component_total > 0 else cost_total_direct
-        if not cost_seen or step_total <= 0:
-            return
-
-        async with self._costs_lock:
-            existing_state = self.read_state() or {}
-            raw_costs = existing_state.get(COSTS_STATE_KEY) or {}
-            cumulative = {
-                key: float(value)
-                for key, value in raw_costs.items()
-                if isinstance(value, (int, float))
-            }
-            last_steps = raw_costs.get("_last_steps")
-            if not isinstance(last_steps, dict):
-                last_steps = {}
-            last_step = last_steps.get(split)
-
-            if isinstance(last_step, (int, float)) and int(last_step) >= step:
-                for component, value in cost_components.items():
-                    if value == 0:
-                        continue
-                    cumulative_key = f"{split}_{component}"
-                    cumulative[cumulative_key] = max(
-                        cumulative.get(cumulative_key, 0.0), value
-                    )
-                cumulative[split] = max(cumulative.get(split, 0.0), step_total)
-                cumulative["total"] = max(
-                    cumulative.get("total", 0.0), cumulative.get(split, 0.0)
-                )
-                self.merge_state(
-                    {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}}
+    def _extract_non_cost_metrics(
+        self, metrics: dict[str, float], split: str
+    ) -> dict[str, float]:
+        non_cost_metrics: dict[str, float] = {}
+        cost_context = "train" if split == "train" else "eval"
+        for metric, value in metrics.items():
+            numeric_value = float(value)
+            if metric == COSTS_TOTAL_KEY:
+                raise ValueError(
+                    "Do not log 'costs_total' directly. Log costs_* components "
+                    "(e.g., costs_prefill, costs_sample) and totals are derived."
                 )
-                self._log_metrics(cumulative, "costs", step)
-                return
-
-            for component, value in cost_components.items():
-                if value == 0:
-                    continue
-                cumulative_key = f"{split}_{component}"
-                cumulative[cumulative_key] = cumulative.get(cumulative_key, 0.0) + value
-            cumulative[split] = cumulative.get(split, 0.0) + step_total
-            cumulative["total"] = cumulative.get("total", 0.0) + step_total
-            last_steps[split] = step
-            self.merge_state(
-                {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}}
-            )
-            self._log_metrics(cumulative, "costs", step)
+            if metric.startswith("costs/"):
+                self._metrics_builder.add_cost(metric[len("costs/") :], numeric_value)
+                continue
+            if metric.startswith(COSTS_METRIC_PREFIX):
+                component = metric[len(COSTS_METRIC_PREFIX) :]
+                if component:
+                    self._metrics_builder.add_cost(
+                        f"{cost_context}/{component}", numeric_value
+                    )
+                continue
+            non_cost_metrics[metric] = numeric_value
+        return non_cost_metrics
 
     async def log(
         self,
@@ -549,42 +517,12 @@ async def log(
         # If only metrics provided (no trajectories), just log them and return
         if trajectories is None:
             if metrics is not None:
-                cost_step = await self.get_step()
-                cost_components: dict[str, float] = {}
-                cost_total_direct = 0.0
-                cost_seen = False
-
-                for metric, value in metrics.items():
-                    if not isinstance(value, (int, float)):
-                        continue
-                    if metric == COSTS_TOTAL_KEY:
-                        raise ValueError(
-                            "Do not log 'costs_total' directly. Log costs_* components "
-                            "(e.g., costs_prefill, costs_sample) and totals are derived."
-                        )
-                    elif metric.startswith(COSTS_METRIC_PREFIX):
-                        component = metric[len(COSTS_METRIC_PREFIX) :]
-                        if component:
-                            cost_components[component] = cost_components.get(
-                                component, 0.0
-                            ) + float(value)
-                            cost_seen = True
-
-                metrics_without_costs = {
-                    key: value
-                    for key, value in metrics.items()
-                    if not key.startswith(COSTS_METRIC_PREFIX)
-                }
+                metrics_without_costs = self._extract_non_cost_metrics(metrics, split)
                 if metrics_without_costs:
                     self._log_metrics(metrics_without_costs, split, step)
-
-                await self._record_costs(
-                    split,
-                    cost_step,
-                    cost_components=cost_components,
-                    cost_total_direct=cost_total_direct,
-                    cost_seen=cost_seen,
-                )
+                costs = await self._metrics_builder.flush(step)
+                if costs:
+                    self._log_metrics(costs, split, step)
             return
 
         # Convert to list[TrajectoryGroup]
@@ -611,38 +549,18 @@ async def log(
         )
 
         # 2. Calculate aggregate metrics (excluding additive costs)
-        cost_step = await self.get_step()
         all_metrics: dict[str, list[float]] = {"reward": [], "exception_rate": []}
         group_metrics: dict[str, list[float]] = {}
-        cost_components: dict[str, float] = {}
-        cost_total_direct = 0.0
-        cost_seen = False
-
-        def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None:
-            nonlocal cost_total_direct, cost_seen
-            for metric, value in metrics_dict.items():
-                if not isinstance(value, (int, float)):
-                    continue
-                if metric == COSTS_TOTAL_KEY:
-                    raise ValueError(
-                        "Do not log 'costs_total' directly. Log costs_* components "
-                        "(e.g., costs_prefill, costs_sample) and totals are derived."
-                    )
-                elif metric.startswith(COSTS_METRIC_PREFIX):
-                    component = metric[len(COSTS_METRIC_PREFIX) :]
-                    if component:
-                        cost_components[component] = cost_components.get(
-                            component, 0.0
-                        ) + float(value)
-                        cost_seen = True
 
         for group in trajectory_groups:
             if group.metrics:
-                _add_costs(group.metrics)
+                group_non_cost = self._extract_non_cost_metrics(
+                    cast(dict[str, float], group.metrics), split
+                )
+            else:
+                group_non_cost = {}
             if group.trajectories:
-                for metric, value in group.metrics.items():
-                    if metric.startswith(COSTS_METRIC_PREFIX):
-                        continue
+                for metric, value in group_non_cost.items():
                     if metric not in group_metrics:
                         group_metrics[metric] = []
                     group_metrics[metric].append(float(value))
@@ -656,14 +574,21 @@ def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None:
                 all_metrics["reward"].append(trajectory.reward)
 
                 # Collect other custom metrics
+                trajectory_metrics: dict[str, float] = {}
                 for metric, value in trajectory.metrics.items():
-                    if metric.startswith(COSTS_METRIC_PREFIX):
-                        continue
+                    routed_metric = metric
+                    if split == "train" and "/" not in routed_metric:
+                        routed_metric = f"reward/{routed_metric}"
+                    trajectory_metrics[routed_metric] = float(value)
+
+                non_cost_trajectory_metrics = self._extract_non_cost_metrics(
+                    trajectory_metrics,
+                    split,
+                )
+                for metric, value in non_cost_trajectory_metrics.items():
                     if metric not in all_metrics:
                         all_metrics[metric] = []
                     all_metrics[metric].append(float(value))
-                if trajectory.metrics:
-                    _add_costs(trajectory.metrics)
 
         # Calculate averages for all metrics
         averages: dict[str, float] = {}
@@ -685,25 +610,16 @@ def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None:
 
         # Merge in any additional metrics passed directly
         if metrics is not None:
-            _add_costs(metrics)
-            metrics_without_costs = {
-                key: value
-                for key, value in metrics.items()
-                if not key.startswith(COSTS_METRIC_PREFIX)
-            }
+            metrics_without_costs = self._extract_non_cost_metrics(metrics, split)
             averages.update(metrics_without_costs)
 
         # 3. Log metrics (writes to history.jsonl and wandb)
         self._log_metrics(averages, split, step)
 
-        # 4. Log cumulative costs (additive)
-        await self._record_costs(
-            split,
-            cost_step,
-            cost_components=cost_components,
-            cost_total_direct=cost_total_direct,
-            cost_seen=cost_seen,
-        )
+        # 4. Log cumulative costs
+        costs = await self._metrics_builder.flush(step)
+        if costs:
+            self._log_metrics(costs, split, step)
 
     async def get_step(self) -> int:
         """
@@ -754,7 +670,6 @@ def __init__(
             report_metrics=report_metrics,
             **kwargs,
         )
-        object.__setattr__(self, "_costs_lock", asyncio.Lock())
         object.__setattr__(self, "_cost_calculator", self._noop_cost_calculator)
         if _internal_config is not None:
             # Bypass BaseModel __setattr__ to allow setting private attr
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 83251f7e..1870f933 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -468,7 +468,7 @@ async def test_exception_rate_calculation(self, tmp_path: Path):
         assert entry["val/exception_rate"] == 0.0
 
     @pytest.mark.asyncio
-    async def test_train_trajectory_metrics_default_to_train_prefix_baseline(
+    async def test_train_trajectory_metrics_default_to_reward_prefix(
         self, tmp_path: Path
     ):
         model = Model(
@@ -500,9 +500,52 @@ async def test_train_trajectory_metrics_default_to_train_prefix_baseline(
         with open(history_path) as f:
             entry = json.loads(f.readline())
 
-        assert entry["train/custom_score"] == 1.0
+        assert entry["reward/custom_score"] == 1.0
         assert entry["reward/prefixed"] == 2.0
 
+    @pytest.mark.asyncio
+    async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=1,
+            metrics={
+                "costs_prefill": 0.2,
+                "costs_sample": 0.3,
+            },
+        )
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=2,
+            metrics={
+                "costs_prefill": 0.1,
+            },
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            first = json.loads(f.readline())
+            second = json.loads(f.readline())
+
+        assert first["costs/train/prefill"] == pytest.approx(0.2)
+        assert first["costs/train/sample"] == pytest.approx(0.3)
+        assert first["costs/train"] == pytest.approx(0.5)
+        assert first["costs/all"] == pytest.approx(0.5)
+        assert first["costs/all_cum"] == pytest.approx(0.5)
+
+        assert second["costs/train/prefill"] == pytest.approx(0.1)
+        assert second["costs/train/prefill_cum"] == pytest.approx(0.3)
+        assert second["costs/train_cum"] == pytest.approx(0.6)
+        assert second["costs/all_cum"] == pytest.approx(0.6)
+
 
 class TestWandbIntegration:
     """Test wandb integration logic (without mocking wandb itself)."""

From 89a58e18829c6bf77420c146ee37452524f95209 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:30:42 -0800
Subject: [PATCH 06/46] feat: Rename train metrics to reward, loss, and
 throughput sections

---
 src/art/model.py                              | 58 ++++++++++++++++---
 .../binary_prefix_tool_pipeline.py            |  6 +-
 tests/unit/test_frontend_logging.py           |  9 ++-
 3 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/src/art/model.py b/src/art/model.py
index 18a1a9ac..eece4e9f 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -43,6 +43,25 @@
     }
 )
 METRIC_SPLITS = frozenset({"train", "val", "test"})
+TRAIN_METRIC_KEY_RENAMES = {
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "tokens_per_second": "throughput/train_tok_per_sec",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
 
 
 class Model(
@@ -481,9 +500,18 @@ def _extract_non_cost_metrics(
                         f"{cost_context}/{component}", numeric_value
                     )
                 continue
-            non_cost_metrics[metric] = numeric_value
+            routed_metric = self._rename_train_metric_key(metric, split)
+            non_cost_metrics[routed_metric] = numeric_value
         return non_cost_metrics
 
+    @staticmethod
+    def _rename_train_metric_key(metric: str, split: str) -> str:
+        if split != "train":
+            return metric
+        if metric.startswith("group_metric_"):
+            return f"reward/group_{metric[len('group_metric_'):]}"
+        return TRAIN_METRIC_KEY_RENAMES.get(metric, metric)
+
     async def log(
         self,
         trajectories: (
@@ -549,7 +577,16 @@ async def log(
         )
 
         # 2. Calculate aggregate metrics (excluding additive costs)
-        all_metrics: dict[str, list[float]] = {"reward": [], "exception_rate": []}
+        reward_key = "reward/mean" if split == "train" else "reward"
+        exception_rate_key = (
+            "reward/exception_rate" if split == "train" else "exception_rate"
+        )
+        reward_std_dev_key = "reward/std_dev" if split == "train" else "reward_std_dev"
+
+        all_metrics: dict[str, list[float]] = {
+            reward_key: [],
+            exception_rate_key: [],
+        }
         group_metrics: dict[str, list[float]] = {}
 
         for group in trajectory_groups:
@@ -566,12 +603,12 @@ async def log(
                     group_metrics[metric].append(float(value))
             for trajectory in group:
                 if isinstance(trajectory, BaseException):
-                    all_metrics["exception_rate"].append(1)
+                    all_metrics[exception_rate_key].append(1)
                     continue
                 else:
-                    all_metrics["exception_rate"].append(0)
+                    all_metrics[exception_rate_key].append(0)
                 # Add reward metric
-                all_metrics["reward"].append(trajectory.reward)
+                all_metrics[reward_key].append(trajectory.reward)
 
                 # Collect other custom metrics
                 trajectory_metrics: dict[str, float] = {}
@@ -599,14 +636,17 @@ async def log(
         # Aggregate group-level metrics once per group
         for metric, values in group_metrics.items():
             if len(values) > 0:
-                averages[f"group_metric_{metric}"] = sum(values) / len(values)
+                group_key = (
+                    f"reward/group_{metric}" if split == "train" else f"group_metric_{metric}"
+                )
+                averages[group_key] = sum(values) / len(values)
 
         # Calculate average standard deviation of rewards within groups
         from .utils.old_benchmarking.calculate_step_metrics import (
             calculate_step_std_dev,
         )
 
-        averages["reward_std_dev"] = calculate_step_std_dev(trajectory_groups)
+        averages[reward_std_dev_key] = calculate_step_std_dev(trajectory_groups)
 
         # Merge in any additional metrics passed directly
         if metrics is not None:
@@ -900,6 +940,10 @@ async def train_sft(
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
             }
+            avg_metrics = {
+                self._rename_train_metric_key(key, "train"): value
+                for key, value in avg_metrics.items()
+            }
             # Get the current step after training
             step = await self.get_step()
             self._log_metrics(avg_metrics, "train", step)
diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
index 52c82975..bc2f5a04 100644
--- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
+++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
@@ -159,12 +159,12 @@ def print_history_summary(model: art.TrainableModel, tail: int = 5) -> None:
 
     rows = pl.read_ndjson(str(history_path)).to_dicts()
 
-    train_rows = [row for row in rows if "train/reward" in row]
+    train_rows = [row for row in rows if "reward/mean" in row]
     print("\nRecent training metrics:")
     for row in train_rows[-tail:]:
         step = row["step"]
-        reward = row["train/reward"]
-        std_dev = row["train/reward_std_dev"]
+        reward = row["reward/mean"]
+        std_dev = row["reward/std_dev"]
         discarded = row["train/discarded_stale_samples"]
         off_policy = row["train/steps_off_policy"]
         print(
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 1870f933..89be518a 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -231,7 +231,7 @@ async def test_history_appends_entries(
         # Check both splits are present
         columns = df.columns
         assert any("val/" in col for col in columns)
-        assert any("train/" in col for col in columns)
+        assert any("reward/" in col for col in columns)
 
 
 class TestPathStructure:
@@ -500,6 +500,9 @@ async def test_train_trajectory_metrics_default_to_reward_prefix(
         with open(history_path) as f:
             entry = json.loads(f.readline())
 
+        assert entry["reward/mean"] == 0.7
+        assert entry["reward/exception_rate"] == 0.0
+        assert "train/reward" not in entry
         assert entry["reward/custom_score"] == 1.0
         assert entry["reward/prefixed"] == 2.0
 
@@ -719,8 +722,8 @@ async def mock_train_sft(*args, **kwargs):
         # Verify metrics are aggregated (averaged)
         entry = json.loads(lines[0])
         assert entry["step"] == 1
-        assert entry["train/loss"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
-        assert entry["train/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
+        assert entry["loss/train"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
+        assert entry["loss/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
 
     @pytest.mark.asyncio
     async def test_train_sft_single_step_increment(self, tmp_path: Path):

From 20f99674b0dacf0fa656a8b1334161713f739c40 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:31:50 -0800
Subject: [PATCH 07/46] feat: Persist MetricsBuilder cumulative state across
 resume

---
 src/art/model.py                    | 21 ++++++++++++++++
 tests/unit/test_frontend_logging.py | 39 +++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/src/art/model.py b/src/art/model.py
index eece4e9f..6f6cd470 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -30,6 +30,7 @@
 
 COSTS_METRIC_PREFIX = "costs_"
 COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
+METRICS_BUILDER_STATE_KEY = "_metrics_builder_state"
 METRIC_SECTIONS = frozenset(
     {
         "reward",
@@ -128,6 +129,7 @@ class Model(
     _wandb_run: Optional["Run"] = None  # Private, for lazy wandb initialization
     _run_start_time: float
     _metrics_builder: MetricsBuilder
+    _metrics_builder_state_loaded: bool
     _cost_calculator: CostCalculator
 
     def __init__(
@@ -159,6 +161,7 @@ def __init__(
         )
         object.__setattr__(self, "_run_start_time", time.time())
         object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train"))
+        object.__setattr__(self, "_metrics_builder_state_loaded", False)
 
     @overload
     def __new__(
@@ -504,6 +507,20 @@ def _extract_non_cost_metrics(
             non_cost_metrics[routed_metric] = numeric_value
         return non_cost_metrics
 
+    def _load_metrics_builder_state(self) -> None:
+        if self._metrics_builder_state_loaded:
+            return
+        state = self.read_state() or {}
+        metrics_state = state.get(METRICS_BUILDER_STATE_KEY)
+        if isinstance(metrics_state, dict):
+            self._metrics_builder.load_state_dict(metrics_state)
+        object.__setattr__(self, "_metrics_builder_state_loaded", True)
+
+    def _persist_metrics_builder_state(self) -> None:
+        self.merge_state(
+            {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()}
+        )
+
     @staticmethod
     def _rename_train_metric_key(metric: str, split: str) -> str:
         if split != "train":
@@ -542,6 +559,8 @@ async def log(
         if step is None:
             step = await self.get_step() if self.trainable else 0
 
+        self._load_metrics_builder_state()
+
         # If only metrics provided (no trajectories), just log them and return
         if trajectories is None:
             if metrics is not None:
@@ -551,6 +570,7 @@ async def log(
                 costs = await self._metrics_builder.flush(step)
                 if costs:
                     self._log_metrics(costs, split, step)
+                self._persist_metrics_builder_state()
             return
 
         # Convert to list[TrajectoryGroup]
@@ -660,6 +680,7 @@ async def log(
         costs = await self._metrics_builder.flush(step)
         if costs:
             self._log_metrics(costs, split, step)
+        self._persist_metrics_builder_state()
 
     async def get_step(self) -> int:
         """
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 89be518a..4d0415fc 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -549,6 +549,45 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
         assert second["costs/train_cum"] == pytest.approx(0.6)
         assert second["costs/all_cum"] == pytest.approx(0.6)
 
+    @pytest.mark.asyncio
+    async def test_cost_cumulative_persists_across_model_recreation(
+        self, tmp_path: Path
+    ):
+        model_1 = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        await model_1.log(
+            trajectories=None,
+            split="train",
+            step=1,
+            metrics={"costs_prefill": 0.25},
+        )
+
+        model_2 = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        await model_2.log(
+            trajectories=None,
+            split="train",
+            step=2,
+            metrics={"costs_prefill": 0.75},
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            first = json.loads(f.readline())
+            second = json.loads(f.readline())
+
+        assert first["costs/train/prefill_cum"] == pytest.approx(0.25)
+        assert second["costs/train/prefill_cum"] == pytest.approx(1.0)
+        assert second["costs/all_cum"] == pytest.approx(1.0)
+
 
 class TestWandbIntegration:
     """Test wandb integration logic (without mocking wandb itself)."""

From 3e5ab1b66d47c73f29492c29a2ff005d878677cf Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:42:55 -0800
Subject: [PATCH 08/46] feat: Emit canonical train metric keys at source

---
 src/art/local/backend.py            | 14 ++++++-----
 src/art/megatron/train.py           |  4 ++--
 src/art/metrics_taxonomy.py         | 31 ++++++++++++++++++++++++
 src/art/model.py                    | 37 +++--------------------------
 src/art/serverless/backend.py       | 19 ++++++++++++---
 src/art/tinker/service.py           |  2 +-
 src/art/tinker_native/backend.py    | 15 +++++++-----
 src/art/unsloth/service.py          | 12 +++++-----
 src/art/unsloth/train.py            | 16 +++++++------
 tests/unit/test_frontend_logging.py | 18 +++++++++++---
 10 files changed, 100 insertions(+), 68 deletions(-)
 create mode 100644 src/art/metrics_taxonomy.py

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index b74c0b05..aecef80a 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -39,6 +39,7 @@
 
 from .. import dev
 from ..backend import AnyTrainableModel, Backend
+from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics
 from ..model import Model, TrainableModel
 from ..preprocessing.pack import (
     PackedTensors,
@@ -579,7 +580,7 @@ async def train(  # type: ignore[override]
                 k: sum(d.get(k, 0) for d in training_metrics)
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
-                if k != "num_gradient_steps"
+                if k != TRAIN_GRADIENT_STEPS_KEY
             }
 
         # Get step and checkpoint path
@@ -686,9 +687,9 @@ async def _train_model(
             # Yield metrics showing no groups were trainable
             # (the frontend will handle logging)
             yield {
-                "num_groups_submitted": num_groups_submitted,
-                "num_groups_trainable": 0,
-                "num_gradient_steps": 0,
+                "train/num_groups_submitted": float(num_groups_submitted),
+                "train/num_groups_trainable": 0.0,
+                TRAIN_GRADIENT_STEPS_KEY: 0.0,
             }
             return
         disk_packed_tensors = packed_tensors_to_dir(
@@ -701,14 +702,15 @@ async def _train_model(
         async for result in service.train(
             disk_packed_tensors, config, dev_config, verbose
         ):
+            result = rename_train_metrics(result)
             num_gradient_steps = int(
-                result.pop("num_gradient_steps", estimated_gradient_steps)
+                result.pop(TRAIN_GRADIENT_STEPS_KEY, estimated_gradient_steps)
             )
             assert num_gradient_steps == estimated_gradient_steps, (
                 f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
             )
             results.append(result)
-            yield {**result, "num_gradient_steps": num_gradient_steps}
+            yield {**result, TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps)}
             pbar.update(1)
             pbar.set_postfix(result)
         pbar.close()
diff --git a/src/art/megatron/train.py b/src/art/megatron/train.py
index 876611a6..3441a0c4 100644
--- a/src/art/megatron/train.py
+++ b/src/art/megatron/train.py
@@ -282,8 +282,8 @@ def print0(*values: Any) -> None:
             with open("/tmp/megatron_training_log.jsonl", "a+") as log_file:
                 log_msg = json.dumps(
                     {
-                        "loss": loss.item(),
-                        "grad_norm": grad_norm,
+                        "loss/train": loss.item(),
+                        "loss/grad_norm": grad_norm,
                         "probs_corr": probs_corr,
                     }
                 )
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
new file mode 100644
index 00000000..e7d108d4
--- /dev/null
+++ b/src/art/metrics_taxonomy.py
@@ -0,0 +1,31 @@
+TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps"
+
+TRAIN_METRIC_KEY_RENAMES = {
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "tokens_per_second": "throughput/train_tok_per_sec",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def rename_train_metric_key(metric: str) -> str:
+    if metric.startswith("group_metric_"):
+        return f"reward/group_{metric[len('group_metric_'):]}"
+    return TRAIN_METRIC_KEY_RENAMES.get(metric, metric)
+
+
+def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]:
+    return {rename_train_metric_key(key): float(value) for key, value in metrics.items()}
diff --git a/src/art/model.py b/src/art/model.py
index 6f6cd470..afe0073d 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -15,6 +15,7 @@
 from . import dev
 from .costs import CostCalculator
 from .metrics import MetricsBuilder
+from .metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY
 from .trajectories import Trajectory, TrajectoryGroup
 from .types import TrainConfig, TrainSFTConfig
 from .utils.trajectory_logging import write_trajectory_groups_parquet
@@ -44,25 +45,6 @@
     }
 )
 METRIC_SPLITS = frozenset({"train", "val", "test"})
-TRAIN_METRIC_KEY_RENAMES = {
-    "reward": "reward/mean",
-    "reward_std_dev": "reward/std_dev",
-    "exception_rate": "reward/exception_rate",
-    "policy_loss": "loss/train",
-    "loss": "loss/train",
-    "entropy": "loss/entropy",
-    "kl_div": "loss/kl_div",
-    "kl_policy_ref": "loss/kl_policy_ref",
-    "grad_norm": "loss/grad_norm",
-    "learning_rate": "loss/learning_rate",
-    "tokens_per_second": "throughput/train_tok_per_sec",
-    "num_groups_submitted": "train/num_groups_submitted",
-    "num_groups_trainable": "train/num_groups_trainable",
-    "num_trajectories": "train/num_trajectories",
-    "num_trainable_tokens": "train/num_trainable_tokens",
-    "train_tokens": "data/step_trainer_tokens",
-    "num_datums": "data/step_num_datums",
-}
 
 
 class Model(
@@ -503,8 +485,7 @@ def _extract_non_cost_metrics(
                         f"{cost_context}/{component}", numeric_value
                     )
                 continue
-            routed_metric = self._rename_train_metric_key(metric, split)
-            non_cost_metrics[routed_metric] = numeric_value
+            non_cost_metrics[metric] = numeric_value
         return non_cost_metrics
 
     def _load_metrics_builder_state(self) -> None:
@@ -521,14 +502,6 @@ def _persist_metrics_builder_state(self) -> None:
             {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()}
         )
 
-    @staticmethod
-    def _rename_train_metric_key(metric: str, split: str) -> str:
-        if split != "train":
-            return metric
-        if metric.startswith("group_metric_"):
-            return f"reward/group_{metric[len('group_metric_'):]}"
-        return TRAIN_METRIC_KEY_RENAMES.get(metric, metric)
-
     async def log(
         self,
         trajectories: (
@@ -913,7 +886,7 @@ async def train(
                 k: sum(d.get(k, 0) for d in training_metrics)
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
-                if k != "num_gradient_steps"
+                if k != TRAIN_GRADIENT_STEPS_KEY
             }
 
         # 3. Log trajectories and training metrics together (single wandb log call)
@@ -961,10 +934,6 @@ async def train_sft(
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
             }
-            avg_metrics = {
-                self._rename_train_metric_key(key, "train"): value
-                for key, value in avg_metrics.items()
-            }
             # Get the current step after training
             step = await self.get_step()
             self._log_metrics(avg_metrics, "train", step)
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index abf67f69..f9ab8c5f 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -9,6 +9,7 @@
 
 from .. import dev
 from ..backend import AnyTrainableModel, Backend
+from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics
 from ..trajectories import Trajectory, TrajectoryGroup
 from ..types import ServerlessTrainResult, TrainConfig, TrainSFTConfig
 from ..utils.record_provenance import record_provenance
@@ -247,7 +248,7 @@ async def train(  # type: ignore[override]
                 k: sum(d.get(k, 0) for d in training_metrics)
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
-                if k != "num_gradient_steps"
+                if k != TRAIN_GRADIENT_STEPS_KEY
             }
 
         # Get step and artifact name
@@ -307,7 +308,13 @@ async def _train_model(
                     assert pbar is not None and num_sequences is not None
                     pbar.update(1)
                     pbar.set_postfix(event.data)
-                    yield {**event.data, "num_gradient_steps": num_sequences}
+                    metrics = rename_train_metrics(
+                        {k: float(v) for k, v in event.data.items()}
+                    )
+                    yield {
+                        **metrics,
+                        TRAIN_GRADIENT_STEPS_KEY: float(num_sequences),
+                    }
                 elif event.type == "training_started":
                     num_sequences = event.data["num_sequences"]
                     if pbar is None:
@@ -472,7 +479,13 @@ async def _train_sft(
                     assert pbar is not None and num_batches is not None
                     pbar.update(1)
                     pbar.set_postfix(event.data)
-                    yield {**event.data, "num_gradient_steps": num_batches}
+                    metrics = rename_train_metrics(
+                        {k: float(v) for k, v in event.data.items()}
+                    )
+                    yield {
+                        **metrics,
+                        TRAIN_GRADIENT_STEPS_KEY: float(num_batches),
+                    }
                 elif event.type == "training_started":
                     num_batches = event.data.get("num_sequences", 0)
                     if pbar is None:
diff --git a/src/art/tinker/service.py b/src/art/tinker/service.py
index ba6768eb..1f5970ac 100644
--- a/src/art/tinker/service.py
+++ b/src/art/tinker/service.py
@@ -80,7 +80,7 @@ def custom_loss_fn(
             for mask, lp in zip(masks, logprobs_list):
                 logprobs[mask] = lp
             loss = loss_fn(inputs, logprobs.unsqueeze(0), None, None, _config)
-            return loss.mean_policy_loss, {"policy_loss": loss.mean_policy_loss.item()}
+            return loss.mean_policy_loss, {"loss/train": loss.mean_policy_loss.item()}
 
         shifted_tokens = shift_tensor(packed_tensors["tokens"], 0)
 
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index e5eb1180..19df73dd 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -30,6 +30,7 @@
 from .. import dev
 from ..backend import Backend
 from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing
+from ..metrics_taxonomy import rename_train_metric_key
 from ..model import Model, TrainableModel
 from ..tinker.backend import get_renderer_name
 from ..tinker.server import get_free_port
@@ -217,8 +218,8 @@ async def train(  # type: ignore[override]
         )
 
         metrics: dict[str, float] = {
-            "num_groups_submitted": float(len(groups_list)),
-            "num_datums": float(len(datums)),
+            "train/num_groups_submitted": float(len(groups_list)),
+            "data/step_num_datums": float(len(datums)),
         }
 
         if not datums:
@@ -227,10 +228,12 @@ async def train(  # type: ignore[override]
         train_tokens = 0
         for datum in datums:
             train_tokens += len(datum.model_input.to_ints())
-        metrics["train_tokens"] = float(train_tokens)
+        metrics["data/step_trainer_tokens"] = float(train_tokens)
         pricing = get_model_pricing(model.base_model)
         if pricing is not None:
-            metrics["costs_train"] = compute_train_cost(train_tokens, pricing)
+            metrics["costs/train/tinker_train"] = compute_train_cost(
+                train_tokens, pricing
+            )
 
         if adam_params is None:
             adam_params = tinker.AdamParams(
@@ -268,12 +271,12 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum:
             for key, value in forward_output.metrics.items():
                 if value is None:
                     continue
-                metrics[key] = float(value)
+                metrics[rename_train_metric_key(key)] = float(value)
         if optim_output.metrics:
             for key, value in optim_output.metrics.items():
                 if value is None:
                     continue
-                metrics[key] = float(value)
+                metrics[rename_train_metric_key(key)] = float(value)
 
         next_step = state.current_step + 1
         checkpoint_name = f"step_{next_step:06d}"
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
index 94d01b78..f3a69179 100644
--- a/src/art/unsloth/service.py
+++ b/src/art/unsloth/service.py
@@ -844,12 +844,12 @@ async def train_sft(
             batch_idx += 1
 
             yield {
-                "loss": batch_loss,
-                "learning_rate": batch.learning_rate,
-                "grad_norm": grad_norm,
-                "num_trajectories": float(batch.num_trajectories),
-                "num_trainable_tokens": float(batch.num_trainable_tokens),
-                "tokens_per_second": tokens_per_second,
+                "loss/train": batch_loss,
+                "loss/learning_rate": batch.learning_rate,
+                "loss/grad_norm": grad_norm,
+                "train/num_trajectories": float(batch.num_trajectories),
+                "train/num_trainable_tokens": float(batch.num_trainable_tokens),
+                "throughput/train_tok_per_sec": tokens_per_second,
             }
 
         # === Cleanup ===
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index 34dbc5cd..f095fe35 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -12,6 +12,7 @@
 
 from .. import dev
 from ..loss import loss_fn, shift_tensor
+from ..metrics_taxonomy import rename_train_metrics
 from ..types import TrainConfig
 
 if TYPE_CHECKING:
@@ -169,14 +170,16 @@ def compute_loss(
             _config,
         )
 
-        trainer._metrics["train"]["learning_rate"].append(config.learning_rate)
-        trainer._metrics["train"]["policy_loss"].append(loss.mean_policy_loss.item())
+        trainer._metrics["train"]["loss/learning_rate"].append(config.learning_rate)
+        trainer._metrics["train"]["loss/train"].append(loss.mean_policy_loss.item())
         if loss.mean_entropy is not None:
-            trainer._metrics["train"]["entropy"].append(loss.mean_entropy.item())
+            trainer._metrics["train"]["loss/entropy"].append(loss.mean_entropy.item())
         if config.beta > 0.0:
-            trainer._metrics["train"]["kl_div"].append(loss.mean_kl.item())
+            trainer._metrics["train"]["loss/kl_div"].append(loss.mean_kl.item())
         if loss.kl_policy_ref is not None:
-            trainer._metrics["train"]["kl_policy_ref"].append(loss.kl_policy_ref.item())
+            trainer._metrics["train"]["loss/kl_policy_ref"].append(
+                loss.kl_policy_ref.item()
+            )
         return loss.mean_policy_loss + config.beta * loss.mean_kl
 
     return compute_loss
@@ -195,8 +198,7 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None:
         if next(iter(logs.keys())).startswith("eval_"):
             metrics = {f"eval_{key}": val for key, val in metrics.items()}
 
-        logs = {**logs, **metrics}
-        logs.pop("learning_rate", None)
+        logs = {**rename_train_metrics(logs), **metrics}
         results_queue.put_nowait(logs)
         trainer._metrics["train"].clear()
 
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 4d0415fc..c5feeefc 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -726,9 +726,21 @@ async def test_train_sft_aggregates_metrics(self, tmp_path: Path):
 
         async def mock_train_sft(*args, **kwargs):
             # Simulate 3 batches with different metrics
-            yield {"loss": 1.0, "learning_rate": 1e-4, "grad_norm": 0.5}
-            yield {"loss": 0.8, "learning_rate": 1e-4, "grad_norm": 0.4}
-            yield {"loss": 0.6, "learning_rate": 1e-4, "grad_norm": 0.3}
+            yield {
+                "loss/train": 1.0,
+                "loss/learning_rate": 1e-4,
+                "loss/grad_norm": 0.5,
+            }
+            yield {
+                "loss/train": 0.8,
+                "loss/learning_rate": 1e-4,
+                "loss/grad_norm": 0.4,
+            }
+            yield {
+                "loss/train": 0.6,
+                "loss/learning_rate": 1e-4,
+                "loss/grad_norm": 0.3,
+            }
 
         mock_backend._train_sft = mock_train_sft
         mock_backend._get_step = AsyncMock(return_value=1)  # Step after training

From 1ba0931603c786b1eeb4ddc458584a546a7c40a9 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:42:59 -0800
Subject: [PATCH 09/46] docs: Add metrics taxonomy guide and smoke example

---
 docs/metrics-taxonomy.md           | 58 ++++++++++++++++++++++
 examples/metrics_taxonomy_smoke.py | 78 ++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 docs/metrics-taxonomy.md
 create mode 100644 examples/metrics_taxonomy_smoke.py

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
new file mode 100644
index 00000000..1bbe3373
--- /dev/null
+++ b/docs/metrics-taxonomy.md
@@ -0,0 +1,58 @@
+# Metrics Taxonomy (Phase 1)
+
+Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups.
+
+## Sections
+
+- `reward/*`
+- `loss/*`
+- `throughput/*`
+- `costs/*`
+- `time/*`
+- `data/*`
+- `train/*`, `val/*`, `test/*`
+
+## Train Key Mapping
+
+Training code now emits canonical keys; legacy key names map to them as follows (`legacy` -> `canonical`):
+
+- `reward` -> `reward/mean`
+- `reward_std_dev` -> `reward/std_dev`
+- `exception_rate` -> `reward/exception_rate`
+- `group_metric_<k>` -> `reward/group_<k>`
+- `policy_loss` / `loss` -> `loss/train`
+- `entropy` -> `loss/entropy`
+- `kl_div` -> `loss/kl_div`
+- `kl_policy_ref` -> `loss/kl_policy_ref`
+- `grad_norm` -> `loss/grad_norm`
+- `learning_rate` -> `loss/learning_rate`
+- `tokens_per_second` -> `throughput/train_tok_per_sec`
+- `num_groups_submitted` -> `train/num_groups_submitted`
+- `num_groups_trainable` -> `train/num_groups_trainable`
+- `num_trajectories` -> `train/num_trajectories`
+- `num_trainable_tokens` -> `train/num_trainable_tokens`
+- `train_tokens` -> `data/step_trainer_tokens`
+- `num_datums` -> `data/step_num_datums`
+- `num_gradient_steps` -> `data/step_num_gradient_steps`
+
+## Cost Rollups
+
+Cost leaves can be logged with either:
+
+- hierarchical keys, e.g. `costs/train/llm_judge/correctness`
+- legacy component keys, e.g. `costs_prefill`, `costs_sample`
+
+ART rolls costs up automatically:
+
+- parent rollups (for example `costs/train`, `costs/all`)
+- cumulative keys with `_cum` suffix (for example `costs/all_cum`)
+
+## End-to-End Smoke Test
+
+Run:
+
+```bash
+uv run python examples/metrics_taxonomy_smoke.py
+```
+
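+This writes a local history file to `<base_path>/<project>/models/<model_name>/history.jsonl` (`base_path` defaults to `.art`, overridable via `ART_METRICS_BASE_PATH`) and, if `WANDB_API_KEY` is set, logs the same metrics to W&B.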
diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py
new file mode 100644
index 00000000..25e9de1c
--- /dev/null
+++ b/examples/metrics_taxonomy_smoke.py
@@ -0,0 +1,78 @@
+import asyncio
+import json
+import os
+from pathlib import Path
+import time
+
+import art
+
+
+async def main() -> None:
+    project = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke")
+    model_name = os.environ.get(
+        "ART_METRICS_MODEL", f"metrics-smoke-{int(time.time())}"
+    )
+    base_path = os.environ.get("ART_METRICS_BASE_PATH", ".art")
+
+    model = art.Model(
+        name=model_name,
+        project=project,
+        base_path=base_path,
+        report_metrics=["wandb"],
+    )
+
+    for step in (1, 2):
+        trajectories = [
+            art.TrajectoryGroup(
+                trajectories=[
+                    art.Trajectory(
+                        reward=0.4 + 0.1 * step,
+                        metrics={
+                            "judge_quality": 0.7 + 0.05 * step,
+                            "reward/custom_prefixed": 0.2 * step,
+                        },
+                        messages_and_choices=[
+                            {"role": "user", "content": f"smoke step {step}"},
+                            {"role": "assistant", "content": "ok"},
+                        ],
+                    )
+                ],
+                exceptions=[],
+            )
+        ]
+
+        await model.log(
+            trajectories,
+            split="train",
+            step=step,
+            metrics={
+                "loss/train": 1.0 / step,
+                "loss/grad_norm": 0.5 + 0.1 * step,
+                "throughput/train_tok_per_sec": 1000.0 + 100.0 * step,
+                "time/step_wall_s": 1.5 + 0.2 * step,
+                "data/step_num_scenarios": 2.0,
+                "data/step_actor_tokens": 120.0 + 10.0 * step,
+                "costs_prefill": 0.10 * step,
+                "costs_sample": 0.05 * step,
+                "costs/train/llm_judge/correctness": 0.02 * step,
+            },
+        )
+
+    history_path = Path(base_path) / project / "models" / model_name / "history.jsonl"
+    print(f"Wrote history: {history_path}")
+
+    with open(history_path) as f:
+        rows = [json.loads(line) for line in f]
+
+    print("\nLast row key excerpts:")
+    last = rows[-1]
+    show_prefixes = ("reward/", "loss/", "throughput/", "time/", "data/", "costs/")
+    for key in sorted(last):
+        if key.startswith(show_prefixes):
+            print(f"{key}: {last[key]}")
+
+    print("\nIf WANDB_API_KEY is set, metrics are also logged to W&B.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 75068fdf31a0b6a460d34af3903105df501a03e6 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 17:50:55 -0800
Subject: [PATCH 10/46] fix: Bind nested cost metrics to training_step in W&B

---
 src/art/model.py                  | 22 ++++++++++++++++++
 tests/unit/test_metric_routing.py | 38 +++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/src/art/model.py b/src/art/model.py
index afe0073d..d3bec930 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -109,6 +109,7 @@ class Model(
     _s3_prefix: str | None = None
     _openai_client: AsyncOpenAI | None = None
     _wandb_run: Optional["Run"] = None  # Private, for lazy wandb initialization
+    _wandb_defined_metrics: set[str]
     _run_start_time: float
     _metrics_builder: MetricsBuilder
     _metrics_builder_state_loaded: bool
@@ -141,6 +142,7 @@ def __init__(
             report_metrics=report_metrics,
             **kwargs,
         )
+        object.__setattr__(self, "_wandb_defined_metrics", set())
         object.__setattr__(self, "_run_start_time", time.time())
         object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train"))
         object.__setattr__(self, "_metrics_builder_state_loaded", False)
@@ -397,6 +399,14 @@ def _get_wandb_run(self) -> Optional["Run"]:
                 ),
             )
             self._wandb_run = run
+            object.__setattr__(
+                self,
+                "_wandb_defined_metrics",
+                {
+                    "training_step",
+                    "time/wall_clock_sec",
+                },
+            )
 
             # Define training_step as the x-axis for all metrics.
             # This allows out-of-order logging (e.g., async validation for previous steps).
@@ -461,8 +471,20 @@ def _log_metrics(
         ) or (self.report_metrics is not None and "wandb" in self.report_metrics)
         if should_log_wandb:
             if run := self._get_wandb_run():
+                self._define_wandb_step_metrics(prefixed.keys())
                 run.log(prefixed)
 
+    def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
+        import wandb
+
+        for key in keys:
+            if not key.startswith("costs/"):
+                continue
+            if key in self._wandb_defined_metrics:
+                continue
+            wandb.define_metric(key, step_metric="training_step")
+            self._wandb_defined_metrics.add(key)
+
     def _extract_non_cost_metrics(
         self, metrics: dict[str, float], split: str
     ) -> dict[str, float]:
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
index e83a48ed..2587385d 100644
--- a/tests/unit/test_metric_routing.py
+++ b/tests/unit/test_metric_routing.py
@@ -74,3 +74,41 @@ def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None
             (("val/*",), {"step_metric": "training_step"}),
             (("test/*",), {"step_metric": "training_step"}),
         ]
+
+    def test_log_metrics_defines_nested_cost_keys_with_training_step(
+        self, tmp_path: Path
+    ) -> None:
+        fake_run = MagicMock()
+        fake_run._is_finished = False
+
+        fake_wandb = types.SimpleNamespace()
+        fake_wandb.init = MagicMock(return_value=fake_run)
+        fake_wandb.define_metric = MagicMock()
+        fake_wandb.Settings = lambda **kwargs: kwargs
+
+        with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False):
+            with patch.dict("sys.modules", {"wandb": fake_wandb}):
+                model = Model(
+                    name="test-model",
+                    project="test-project",
+                    base_path=str(tmp_path),
+                    report_metrics=["wandb"],
+                )
+                model._log_metrics(
+                    {
+                        "costs/train/sample": 0.1,
+                        "costs/train/prefill_cum": 0.2,
+                    },
+                    split="train",
+                    step=1,
+                )
+
+        define_calls = [
+            (call.args, call.kwargs)
+            for call in fake_wandb.define_metric.call_args_list
+        ]
+        assert (("costs/train/sample",), {"step_metric": "training_step"}) in define_calls
+        assert (
+            (("costs/train/prefill_cum",), {"step_metric": "training_step"})
+            in define_calls
+        )

From f958e3ca4d72e7aa65f215ed3780c92ad618611e Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 18:23:53 -0800
Subject: [PATCH 11/46] feat: Add API cost decorator and metrics context wiring

---
 src/art/metrics.py                  | 335 +++++++++++++++++++++++++++-
 src/art/model.py                    |  11 +
 src/art/pipeline_trainer/trainer.py |  12 +-
 3 files changed, 350 insertions(+), 8 deletions(-)

diff --git a/src/art/metrics.py b/src/art/metrics.py
index 4ff89f00..623aadcf 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -1,26 +1,164 @@
 from __future__ import annotations
 
 import asyncio
+from collections.abc import Callable
+from contextlib import contextmanager
 from contextvars import ContextVar, Token
-from typing import Any
+from dataclasses import dataclass
+from functools import wraps
+from inspect import iscoroutinefunction
+from typing import Any, ParamSpec, TypeVar
+
+from .costs import tokens_to_cost
 
 _active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder")
 
 _HIERARCHICAL_SECTIONS = {"costs", "time", "data"}
+_DEFAULT_PROVIDER = "openai"
+_OPENAI_PROVIDER = "openai"
+_ANTHROPIC_PROVIDER = "anthropic"
+
+P = ParamSpec("P")
+R = TypeVar("R")
+
+
+CostExtractor = Callable[[Any], float | None]
+ResponseGetter = Callable[[Any], Any]
+
+
+@dataclass(frozen=True)
+class TokenPricing:
+    prompt_per_million: float
+    completion_per_million: float
+
+
+_DEFAULT_TOKEN_PRICING = {
+    _OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0),
+    _ANTHROPIC_PROVIDER: TokenPricing(
+        prompt_per_million=3.0, completion_per_million=15.0
+    ),
+}
+
+
+@dataclass
+class _SharedMetricsState:
+    lock: asyncio.Lock
+    step_buffer: dict[str, float]
+    cum_state: dict[str, float]
+    unique_scenario_ids: set[str]
+    cost_extractors: dict[str, CostExtractor]
+    token_pricing: dict[str, TokenPricing]
+
+
+def _new_shared_metrics_state() -> _SharedMetricsState:
+    return _SharedMetricsState(
+        lock=asyncio.Lock(),
+        step_buffer={},
+        cum_state={},
+        unique_scenario_ids=set(),
+        cost_extractors={},
+        token_pricing=dict(_DEFAULT_TOKEN_PRICING),
+    )
+
+
+def _normalize_provider(provider: str | None) -> str | None:
+    if provider is None:
+        return None
+    normalized = provider.strip().lower()
+    if not normalized:
+        return None
+    return normalized
+
+
+def _read_usage_field(usage: Any, field: str) -> float | None:
+    if usage is None:
+        return None
+    if isinstance(usage, dict):
+        value = usage.get(field)
+    else:
+        value = getattr(usage, field, None)
+    if value is None:
+        return None
+    return float(value)
+
+
+def _response_usage(response: Any) -> Any:
+    if isinstance(response, dict):
+        return response.get("usage")
+    return getattr(response, "usage", None)
+
+
+def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None:
+    usage = _response_usage(response)
+    prompt_tokens = _read_usage_field(usage, "prompt_tokens")
+    completion_tokens = _read_usage_field(usage, "completion_tokens")
+    if prompt_tokens is None and completion_tokens is None:
+        return None
+    return prompt_tokens or 0.0, completion_tokens or 0.0
+
+
+def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None:
+    usage = _response_usage(response)
+    input_tokens = _read_usage_field(usage, "input_tokens")
+    output_tokens = _read_usage_field(usage, "output_tokens")
+    if input_tokens is None and output_tokens is None:
+        return None
+    return input_tokens or 0.0, output_tokens or 0.0
+
+
+def _detect_provider(response: Any) -> str | None:
+    usage = _response_usage(response)
+    if usage is None:
+        return None
+
+    if (
+        _read_usage_field(usage, "prompt_tokens") is not None
+        or _read_usage_field(usage, "completion_tokens") is not None
+    ):
+        return _OPENAI_PROVIDER
+    if (
+        _read_usage_field(usage, "input_tokens") is not None
+        or _read_usage_field(usage, "output_tokens") is not None
+    ):
+        return _ANTHROPIC_PROVIDER
+    return None
+
+
+def _estimate_cost(
+    token_counts: tuple[float, float] | None,
+    pricing: TokenPricing,
+) -> float | None:
+    if token_counts is None:
+        return None
+    prompt_tokens, completion_tokens = token_counts
+    return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost(
+        completion_tokens,
+        pricing.completion_per_million,
+    )
 
 
 class MetricsBuilder:
     """Build and accumulate step-level metrics for logging."""
 
-    def __init__(self, cost_context: str) -> None:
+    def __init__(
+        self,
+        cost_context: str,
+        *,
+        _shared_state: _SharedMetricsState | None = None,
+    ) -> None:
         if not cost_context:
             raise ValueError("cost_context must be non-empty")
 
         self.cost_context = cost_context
-        self._lock = asyncio.Lock()
-        self._step_buffer: dict[str, float] = {}
-        self._cum_state: dict[str, float] = {}
-        self._unique_scenario_ids: set[str] = set()
+        self._shared_state = (
+            _shared_state if _shared_state is not None else _new_shared_metrics_state()
+        )
+        self._lock = self._shared_state.lock
+        self._step_buffer = self._shared_state.step_buffer
+        self._cum_state = self._shared_state.cum_state
+        self._unique_scenario_ids = self._shared_state.unique_scenario_ids
+        self._cost_extractors = self._shared_state.cost_extractors
+        self._token_pricing = self._shared_state.token_pricing
 
     def add_cost(self, path: str, usd: float) -> None:
         if not path:
@@ -99,10 +237,52 @@ async def flush(self, step: int) -> dict[str, float]:
     def activate(self) -> Token["MetricsBuilder"]:
         return _active_builder.set(self)
 
+    @contextmanager
+    def activate_context(self):
+        token = self.activate()
+        try:
+            yield self
+        finally:
+            token.var.reset(token)
+
     @staticmethod
     def get_active() -> "MetricsBuilder":
         return _active_builder.get()
 
+    def for_cost_context(self, cost_context: str) -> "MetricsBuilder":
+        normalized_cost_context = cost_context.strip()
+        if not normalized_cost_context:
+            raise ValueError("cost_context must be non-empty")
+        if normalized_cost_context == self.cost_context:
+            return self
+        return MetricsBuilder(
+            cost_context=normalized_cost_context,
+            _shared_state=self._shared_state,
+        )
+
+    def register_cost_extractor(
+        self, provider: str, extractor: CostExtractor
+    ) -> None:
+        normalized_provider = _normalize_provider(provider)
+        if normalized_provider is None:
+            raise ValueError("provider must be non-empty")
+        self._cost_extractors[normalized_provider] = extractor
+
+    def register_token_pricing(
+        self,
+        provider: str,
+        *,
+        prompt_per_million: float,
+        completion_per_million: float,
+    ) -> None:
+        normalized_provider = _normalize_provider(provider)
+        if normalized_provider is None:
+            raise ValueError("provider must be non-empty")
+        self._token_pricing[normalized_provider] = TokenPricing(
+            prompt_per_million=float(prompt_per_million),
+            completion_per_million=float(completion_per_million),
+        )
+
     def state_dict(self) -> dict[str, Any]:
         return {
             "cum_state": dict(self._cum_state),
@@ -172,3 +352,146 @@ def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]:
         rollups["costs/all"] = costs_all
 
         return rollups
+
+    def _resolve_token_pricing(
+        self,
+        provider: str | None,
+        *,
+        prompt_price_per_million: float | None = None,
+        completion_price_per_million: float | None = None,
+    ) -> TokenPricing:
+        normalized_provider = _normalize_provider(provider) or _DEFAULT_PROVIDER
+        default_pricing = self._token_pricing.get(
+            normalized_provider,
+            self._token_pricing[_DEFAULT_PROVIDER],
+        )
+        return TokenPricing(
+            prompt_per_million=(
+                float(prompt_price_per_million)
+                if prompt_price_per_million is not None
+                else default_pricing.prompt_per_million
+            ),
+            completion_per_million=(
+                float(completion_price_per_million)
+                if completion_price_per_million is not None
+                else default_pricing.completion_per_million
+            ),
+        )
+
+    def _extract_api_cost(
+        self,
+        response: Any,
+        *,
+        provider: str | None = None,
+        prompt_price_per_million: float | None = None,
+        completion_price_per_million: float | None = None,
+    ) -> float | None:
+        provider_name = _normalize_provider(provider) or _detect_provider(response)
+        if provider_name is not None:
+            custom_extractor = self._cost_extractors.get(provider_name)
+            if custom_extractor is not None:
+                custom_cost = custom_extractor(response)
+                if custom_cost is not None:
+                    return float(custom_cost)
+
+            token_pricing = self._resolve_token_pricing(
+                provider_name,
+                prompt_price_per_million=prompt_price_per_million,
+                completion_price_per_million=completion_price_per_million,
+            )
+            if provider_name == _OPENAI_PROVIDER:
+                return _estimate_cost(
+                    _extract_openai_token_counts(response),
+                    token_pricing,
+                )
+            if provider_name == _ANTHROPIC_PROVIDER:
+                return _estimate_cost(
+                    _extract_anthropic_token_counts(response),
+                    token_pricing,
+                )
+
+        token_pricing = self._resolve_token_pricing(
+            provider_name,
+            prompt_price_per_million=prompt_price_per_million,
+            completion_price_per_million=completion_price_per_million,
+        )
+        token_counts = _extract_openai_token_counts(response)
+        if token_counts is None:
+            token_counts = _extract_anthropic_token_counts(response)
+        return _estimate_cost(token_counts, token_pricing)
+
+
+def _record_api_cost(
+    *,
+    result: Any,
+    source: str,
+    provider: str | None,
+    response_getter: ResponseGetter | None,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+) -> None:
+    try:
+        builder = MetricsBuilder.get_active()
+    except LookupError:
+        return
+
+    response = response_getter(result) if response_getter is not None else result
+    cost = builder._extract_api_cost(
+        response,
+        provider=provider,
+        prompt_price_per_million=prompt_price_per_million,
+        completion_price_per_million=completion_price_per_million,
+    )
+    if cost is None:
+        return
+    builder.add_cost(f"{builder.cost_context}/{source}", cost)
+
+
+def track_api_cost(
+    *,
+    source: str,
+    provider: str | None = None,
+    response_getter: ResponseGetter | None = None,
+    prompt_price_per_million: float | None = None,
+    completion_price_per_million: float | None = None,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+    normalized_source = source.strip("/")
+    if not normalized_source:
+        raise ValueError("source must be non-empty")
+
+    normalized_provider = _normalize_provider(provider)
+
+    def _decorate(func: Callable[P, R]) -> Callable[P, R]:
+        if iscoroutinefunction(func):
+
+            @wraps(func)
+            async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
+                result = await func(*args, **kwargs)
+                _record_api_cost(
+                    result=result,
+                    source=normalized_source,
+                    provider=normalized_provider,
+                    response_getter=response_getter,
+                    prompt_price_per_million=prompt_price_per_million,
+                    completion_price_per_million=completion_price_per_million,
+                )
+                return result
+
+            return _async_wrapper
+
+        @wraps(func)
+        def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
+            result = func(*args, **kwargs)
+            _record_api_cost(
+                result=result,
+                source=normalized_source,
+                provider=normalized_provider,
+                response_getter=response_getter,
+                prompt_price_per_million=prompt_price_per_million,
+                completion_price_per_million=completion_price_per_million,
+            )
+            return result
+
+        return _sync_wrapper
+
+    return _decorate
diff --git a/src/art/model.py b/src/art/model.py
index d3bec930..91f9e81b 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -1,4 +1,5 @@
 import asyncio
+from contextvars import Token
 from datetime import datetime
 import json
 import os
@@ -510,6 +511,16 @@ def _extract_non_cost_metrics(
             non_cost_metrics[metric] = numeric_value
         return non_cost_metrics
 
+    def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder:
+        if cost_context is None:
+            return self._metrics_builder
+        return self._metrics_builder.for_cost_context(cost_context)
+
+    def activate_metrics_context(
+        self, cost_context: str
+    ) -> Token[MetricsBuilder]:
+        return self.metrics_builder(cost_context).activate()
+
     def _load_metrics_builder_state(self) -> None:
         if self._metrics_builder_state_loaded:
             return
diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py
index a061636b..9dcec1cd 100644
--- a/src/art/pipeline_trainer/trainer.py
+++ b/src/art/pipeline_trainer/trainer.py
@@ -328,7 +328,11 @@ async def _rollout_worker(self, worker_id: int) -> None:
 
                 initial_version = self.state.policy_version
 
-                group = await self.rollout_fn(self.model, scenario, self.config)
+                token = self.model.activate_metrics_context("train")
+                try:
+                    group = await self.rollout_fn(self.model, scenario, self.config)
+                finally:
+                    token.var.reset(token)
                 if not isinstance(group, TrajectoryGroup):
                     errored = True
                     continue
@@ -562,7 +566,11 @@ async def _run_eval(self, step: int) -> None:
         self._status.note_val_started(step)
         reward: float | None = None
         try:
-            result = await self.eval_fn(self.model, step, self.config)
+            token = self.model.activate_metrics_context("eval")
+            try:
+                result = await self.eval_fn(self.model, step, self.config)
+            finally:
+                token.var.reset(token)
             splits: dict[str, list[art.Trajectory | art.TrajectoryGroup]]
             if isinstance(result, dict):
                 splits = result

From 7294638bbe7b7bd3f27bffd6db686a75ee192509 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 18:23:57 -0800
Subject: [PATCH 12/46] test: Add coverage for API cost decorator and context
 routing

---
 tests/unit/test_track_api_cost.py | 310 ++++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 tests/unit/test_track_api_cost.py

diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
new file mode 100644
index 00000000..4cb6dd34
--- /dev/null
+++ b/tests/unit/test_track_api_cost.py
@@ -0,0 +1,310 @@
+import asyncio
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+from art import Model, TrainableModel, Trajectory, TrajectoryGroup
+from art.metrics import MetricsBuilder, track_api_cost
+from art.pipeline_trainer.trainer import PipelineTrainer
+
+
+class _OpenAIUsage:
+    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
+        self.prompt_tokens = prompt_tokens
+        self.completion_tokens = completion_tokens
+
+
+class _OpenAIResponse:
+    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
+        self.usage = _OpenAIUsage(prompt_tokens, completion_tokens)
+
+
+class _AnthropicUsage:
+    def __init__(self, input_tokens: int, output_tokens: int) -> None:
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+
+
+class _AnthropicResponse:
+    def __init__(self, input_tokens: int, output_tokens: int) -> None:
+        self.usage = _AnthropicUsage(input_tokens, output_tokens)
+
+
+class TestTrackApiCost:
+    @pytest.mark.asyncio
+    async def test_openai_cost_extraction_with_explicit_pricing(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush(step=1)
+        assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
+
+    @pytest.mark.asyncio
+    async def test_anthropic_cost_extraction_uses_registered_pricing(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.register_token_pricing(
+            "anthropic",
+            prompt_per_million=5.0,
+            completion_per_million=7.0,
+        )
+
+        @track_api_cost(source="llm_judge/faithfulness")
+        async def _judge() -> _AnthropicResponse:
+            return _AnthropicResponse(input_tokens=40, output_tokens=60)
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush(step=1)
+        assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062)
+
+    @pytest.mark.asyncio
+    async def test_custom_extractor_takes_precedence(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.register_cost_extractor("openai", lambda _response: 0.75)
+
+        @track_api_cost(
+            source="llm_judge/custom",
+            provider="openai",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=1, completion_tokens=1)
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush(step=1)
+        assert metrics["costs/train/llm_judge/custom"] == pytest.approx(0.75)
+
+    @pytest.mark.asyncio
+    async def test_decorator_noops_without_active_builder(self) -> None:
+        @track_api_cost(source="llm_judge/no_context", provider="openai")
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
+
+        result = await _judge()
+        assert isinstance(result, _OpenAIResponse)
+
+    @pytest.mark.asyncio
+    async def test_for_cost_context_routes_to_eval_and_shares_state(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        eval_builder = builder.for_cost_context("eval")
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        token = eval_builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush(step=1)
+        assert metrics["costs/eval/llm_judge/correctness"] == pytest.approx(0.0002)
+
+
+class TestTrackApiCostIntegration:
+    @pytest.mark.asyncio
+    async def test_model_log_emits_train_and_eval_costs(self, tmp_path: Path) -> None:
+        model = Model(
+            name="metrics-cost-test",
+            project="metrics-cost-test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _train_judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        @track_api_cost(
+            source="llm_judge/factuality",
+            provider="anthropic",
+            prompt_price_per_million=3.0,
+            completion_price_per_million=4.0,
+        )
+        async def _eval_judge() -> _AnthropicResponse:
+            return _AnthropicResponse(input_tokens=40, output_tokens=10)
+
+        train_token = model.activate_metrics_context("train")
+        try:
+            await _train_judge()
+        finally:
+            train_token.var.reset(train_token)
+
+        await model.log(trajectories=None, split="train", step=1, metrics={})
+
+        eval_token = model.activate_metrics_context("eval")
+        try:
+            await _eval_judge()
+        finally:
+            eval_token.var.reset(eval_token)
+
+        await model.log(trajectories=None, split="val", step=2, metrics={})
+
+        history_path = (
+            tmp_path
+            / "metrics-cost-test"
+            / "models"
+            / "metrics-cost-test"
+            / "history.jsonl"
+        )
+        with open(history_path) as f:
+            first = json.loads(f.readline())
+            second = json.loads(f.readline())
+
+        assert first["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
+        assert second["costs/eval/llm_judge/factuality"] == pytest.approx(0.00016)
+        assert second["costs/all_cum"] == pytest.approx(0.00036)
+
+    @pytest.mark.asyncio
+    async def test_pipeline_trainer_activates_train_context_for_rollouts(
+        self, tmp_path: Path
+    ) -> None:
+        model = TrainableModel(
+            name="pipeline-context-test",
+            project="pipeline-context-test",
+            base_model="test-model",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = MagicMock()
+        observed_contexts: list[str] = []
+
+        async def rollout_fn(
+            _model: TrainableModel,
+            _scenario: dict,
+            _config: dict,
+        ) -> TrajectoryGroup:
+            observed_contexts.append(MetricsBuilder.get_active().cost_context)
+            return TrajectoryGroup(
+                [
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+
+        trainer = PipelineTrainer(
+            model=model,
+            backend=backend,
+            rollout_fn=rollout_fn,
+            scenarios=[{"metadata": {"scenario_id": "s1"}}],
+            config={},
+            num_rollout_workers=1,
+            min_batch_size=1,
+            max_batch_size=1,
+            eval_fn=None,
+        )
+        trainer._output_queue = asyncio.Queue()
+
+        await trainer._rollout_worker(worker_id=0)
+
+        assert observed_contexts == ["train"]
+
+    @pytest.mark.asyncio
+    async def test_pipeline_trainer_activates_eval_context_for_eval_fn(
+        self, tmp_path: Path
+    ) -> None:
+        model = TrainableModel(
+            name="pipeline-eval-context-test",
+            project="pipeline-eval-context-test",
+            base_model="test-model",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = MagicMock()
+        observed_contexts: list[str] = []
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge_call() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        async def eval_fn(
+            _model: TrainableModel,
+            _step: int,
+            _config: dict,
+        ) -> list[Trajectory]:
+            observed_contexts.append(MetricsBuilder.get_active().cost_context)
+            await _judge_call()
+            return [
+                Trajectory(
+                    reward=1.0,
+                    messages_and_choices=[
+                        {"role": "user", "content": "hello"},
+                        {"role": "assistant", "content": "hi"},
+                    ],
+                )
+            ]
+
+        trainer = PipelineTrainer(
+            model=model,
+            backend=backend,
+            rollout_fn=lambda *_args, **_kwargs: asyncio.sleep(0),
+            scenarios=[],
+            config={},
+            num_rollout_workers=1,
+            min_batch_size=1,
+            max_batch_size=1,
+            eval_fn=eval_fn,
+        )
+
+        await trainer._run_eval(step=1)
+
+        assert observed_contexts == ["eval"]
+
+        history_path = (
+            tmp_path
+            / "pipeline-eval-context-test"
+            / "models"
+            / "pipeline-eval-context-test"
+            / "history.jsonl"
+        )
+        with open(history_path) as f:
+            rows = [json.loads(line) for line in f if line.strip()]
+
+        assert any("costs/eval/llm_judge/correctness" in row for row in rows)

From c91cf271f111825f991b57f85033d9f7eab639a1 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 18:24:01 -0800
Subject: [PATCH 13/46] docs: Add API cost decorator guide and smoke demo

---
 docs/metrics-taxonomy.md           | 57 +++++++++++++++++++++++++++++-
 examples/metrics_taxonomy_smoke.py | 34 ++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index 1bbe3373..b5a2294c 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -1,4 +1,4 @@
-# Metrics Taxonomy (Phase 1)
+# Metrics Taxonomy (Phase 1-3)
 
 Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups.
 
@@ -56,3 +56,58 @@ uv run python examples/metrics_taxonomy_smoke.py
 ```
 
 This writes a local history file and, if `WANDB_API_KEY` is set, logs the same metrics to W&B.
+
+## API Cost Decorator (Phase 2/3)
+
+Use `@track_api_cost` to automatically write judge/API spend into `costs/{train|eval}/...`.
+
+```python
+from art.metrics import track_api_cost
+
+@track_api_cost(
+    source="llm_judge/correctness",
+    provider="openai",
+    prompt_price_per_million=1.0,
+    completion_price_per_million=2.0,
+)
+async def run_judge(client, messages):
+    return await client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=messages,
+    )
+```
+
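+Activate the metrics cost context while running train/eval logic so that decorated calls are attributed to the matching `costs/train/...` or `costs/eval/...` keys: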
+
+```python
+train_token = model.activate_metrics_context("train")
+try:
+    await run_judge(client, train_messages)
+finally:
+    train_token.var.reset(train_token)
+
+eval_token = model.activate_metrics_context("eval")
+try:
+    await run_judge(client, eval_messages)
+finally:
+    eval_token.var.reset(eval_token)
+```
+
+The next `model.log(...)` call flushes the recorded costs, so the metrics logged for that step will include:
+
+- `costs/train/llm_judge/correctness` (or `costs/eval/...`)
+- hierarchical rollups like `costs/train`, `costs/all`
+- cumulative keys like `costs/all_cum`
+
+Built-in providers:
+
+- OpenAI usage (`prompt_tokens`, `completion_tokens`)
+- Anthropic usage (`input_tokens`, `output_tokens`)
+
+You can override pricing per decorator call or configure builder-level defaults:
+
+```python
+builder = model.metrics_builder()
+builder.register_token_pricing("openai", prompt_per_million=1.2, completion_per_million=4.8)
+builder.register_cost_extractor("openai", lambda response: 0.001)  # optional custom extractor
+```
diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py
index 25e9de1c..28941b1f 100644
--- a/examples/metrics_taxonomy_smoke.py
+++ b/examples/metrics_taxonomy_smoke.py
@@ -5,6 +5,34 @@
 import time
 
 import art
+from art.metrics import track_api_cost
+
+
+class _Usage:
+    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
+        self.prompt_tokens = prompt_tokens
+        self.completion_tokens = completion_tokens
+
+
+class _Response:
+    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
+        self.usage = _Usage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+
+
+@track_api_cost(
+    source="llm_judge/decorator_demo",
+    provider="openai",
+    prompt_price_per_million=1.0,
+    completion_price_per_million=2.0,
+)
+async def _mock_judge_call(step: int) -> _Response:
+    return _Response(
+        prompt_tokens=50 * step,
+        completion_tokens=20 * step,
+    )
 
 
 async def main() -> None:
@@ -22,6 +50,12 @@ async def main() -> None:
     )
 
     for step in (1, 2):
+        train_token = model.activate_metrics_context("train")
+        try:
+            await _mock_judge_call(step)
+        finally:
+            train_token.var.reset(train_token)
+
         trajectories = [
             art.TrajectoryGroup(
                 trajectories=[

From 6fb0d8cc56d2f137a1dac9657670cfaab85e7966 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Wed, 4 Mar 2026 18:25:10 -0800
Subject: [PATCH 14/46] fix: Parse entity and project in metrics smoke config

---
 examples/metrics_taxonomy_smoke.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py
index 28941b1f..4f2c4a2f 100644
--- a/examples/metrics_taxonomy_smoke.py
+++ b/examples/metrics_taxonomy_smoke.py
@@ -36,7 +36,15 @@ async def _mock_judge_call(step: int) -> _Response:
 
 
 async def main() -> None:
-    project = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke")
+    project_spec = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke")
+    entity = os.environ.get("ART_METRICS_ENTITY")
+    project = project_spec
+    if entity is None and "/" in project_spec:
+        split_entity, split_project = project_spec.split("/", 1)
+        if split_entity and split_project:
+            entity = split_entity
+            project = split_project
+
     model_name = os.environ.get(
         "ART_METRICS_MODEL", f"metrics-smoke-{int(time.time())}"
     )
@@ -45,6 +53,7 @@ async def main() -> None:
     model = art.Model(
         name=model_name,
         project=project,
+        entity=entity,
         base_path=base_path,
         report_metrics=["wandb"],
     )

From 754ef57c319f248ff1acf82c73b97f501f2f4fab Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 10:36:04 -0700
Subject: [PATCH 15/46] test: Cover metrics builder resume and cumulative
 routing

---
 tests/unit/test_frontend_logging.py | 30 +++++++++++++++++++++++
 tests/unit/test_metrics_builder.py  | 37 +++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index c5feeefc..1ece42e6 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -588,6 +588,36 @@ async def test_cost_cumulative_persists_across_model_recreation(
         assert second["costs/train/prefill_cum"] == pytest.approx(1.0)
         assert second["costs/all_cum"] == pytest.approx(1.0)
 
+    @pytest.mark.asyncio
+    async def test_direct_time_and_data_metrics_get_cumulative_variants(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=1,
+            metrics={
+                "time/step_actor_s": 1.5,
+                "data/step_actor_tokens": 10,
+            },
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["time/step_actor_s"] == pytest.approx(1.5)
+        assert entry["time/step_actor_s_cum"] == pytest.approx(1.5)
+        assert entry["data/step_actor_tokens"] == pytest.approx(10)
+        assert entry["data/step_actor_tokens_cum"] == pytest.approx(10)
+
 
 class TestWandbIntegration:
     """Test wandb integration logic (without mocking wandb itself)."""
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index 7b56c150..db083242 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -57,6 +57,27 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
         assert second["data/step_actor_tokens_cum"] == pytest.approx(15)
         assert second["data/cum_num_unique_scenarios"] == 3
 
+    @pytest.mark.asyncio
+    async def test_helper_metrics_accumulate_within_a_single_step(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        builder.add_data(step_num_scenarios=2, step_actor_tokens=10)
+        builder.add_data(step_num_scenarios=3, step_actor_tokens=5)
+        builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3, step_eval_s=0.2)
+        builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2, step_eval_s=0.1)
+        builder.add_idle_times(step_trainer_idle_s=1.0, step_actor_idle_s=2.0)
+        builder.add_idle_times(step_trainer_idle_s=0.5, step_actor_idle_s=1.0)
+
+        metrics = await builder.flush(step=1)
+
+        assert metrics["data/step_num_scenarios"] == pytest.approx(5)
+        assert metrics["data/step_actor_tokens"] == pytest.approx(15)
+        assert metrics["time/step_wall_s"] == pytest.approx(2.0)
+        assert metrics["time/step_actor_s"] == pytest.approx(0.5)
+        assert metrics["time/step_eval_s"] == pytest.approx(0.3)
+        assert metrics["throughput/step_trainer_idle_s"] == pytest.approx(1.5)
+        assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0)
+
     @pytest.mark.asyncio
     async def test_costs_all_generated_for_single_and_multiple_children(self) -> None:
         single = MetricsBuilder(cost_context="train")
@@ -128,6 +149,22 @@ async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
         assert metrics["costs/train_cum"] == pytest.approx(3.0)
         assert metrics["costs/all_cum"] == pytest.approx(3.0)
 
+    @pytest.mark.asyncio
+    async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
+        before = MetricsBuilder(cost_context="train")
+        before.add_cost("train/gpu", usd=1.0)
+        await before.flush(step=1)
+
+        after = MetricsBuilder(cost_context="train")
+        after.load_state_dict(before.state_dict())
+
+        eval_builder = after.for_cost_context("eval")
+        eval_builder.add_cost("eval/judge", usd=2.0)
+
+        metrics = await eval_builder.flush(step=2)
+        assert metrics["costs/eval/judge"] == pytest.approx(2.0)
+        assert metrics["costs/all_cum"] == pytest.approx(3.0)
+
     @pytest.mark.asyncio
     async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
         builder = MetricsBuilder(cost_context="train")

From 4659a5ba4128feb639dc14548ce34d10fcf6c582 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 10:37:25 -0700
Subject: [PATCH 16/46] fix: Restore MetricsBuilder cumulative state and
 routing

---
 src/art/metrics.py | 38 +++++++++++++++++++++++++++-----------
 src/art/model.py   |  4 ++++
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/src/art/metrics.py b/src/art/metrics.py
index 623aadcf..ea299427 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -164,7 +164,12 @@ def add_cost(self, path: str, usd: float) -> None:
         if not path:
             raise ValueError("Cost path must be non-empty")
         full_key = f"costs/{path}"
-        self._validate_and_add(full_key, float(usd))
+        self.add_metric(full_key, float(usd))
+
+    def add_metric(self, key: str, value: float) -> None:
+        if "/" not in key:
+            raise ValueError("Metric key must include a section prefix")
+        self._validate_and_add(key, float(value))
 
     def add_data(
         self,
@@ -173,9 +178,9 @@ def add_data(
         scenario_ids: list[str] | None = None,
     ) -> None:
         if step_num_scenarios is not None:
-            self._step_buffer["data/step_num_scenarios"] = float(step_num_scenarios)
+            self.add_metric("data/step_num_scenarios", float(step_num_scenarios))
         if step_actor_tokens is not None:
-            self._step_buffer["data/step_actor_tokens"] = float(step_actor_tokens)
+            self.add_metric("data/step_actor_tokens", float(step_actor_tokens))
         if scenario_ids is not None:
             self._unique_scenario_ids.update(scenario_ids)
 
@@ -186,11 +191,11 @@ def add_user_timing(
         step_eval_s: float | None = None,
     ) -> None:
         if step_wall_s is not None:
-            self._step_buffer["time/step_wall_s"] = float(step_wall_s)
+            self.add_metric("time/step_wall_s", float(step_wall_s))
         if step_actor_s is not None:
-            self._step_buffer["time/step_actor_s"] = float(step_actor_s)
+            self.add_metric("time/step_actor_s", float(step_actor_s))
         if step_eval_s is not None:
-            self._step_buffer["time/step_eval_s"] = float(step_eval_s)
+            self.add_metric("time/step_eval_s", float(step_eval_s))
 
     def add_idle_times(
         self,
@@ -198,11 +203,12 @@ def add_idle_times(
         step_actor_idle_s: float | None = None,
     ) -> None:
         if step_trainer_idle_s is not None:
-            self._step_buffer["throughput/step_trainer_idle_s"] = float(
-                step_trainer_idle_s
+            self.add_metric(
+                "throughput/step_trainer_idle_s",
+                float(step_trainer_idle_s),
             )
         if step_actor_idle_s is not None:
-            self._step_buffer["throughput/step_actor_idle_s"] = float(step_actor_idle_s)
+            self.add_metric("throughput/step_actor_idle_s", float(step_actor_idle_s))
 
     async def flush(self, step: int) -> dict[str, float]:
         del step
@@ -292,8 +298,18 @@ def state_dict(self) -> dict[str, Any]:
     def load_state_dict(self, state: dict[str, Any]) -> None:
         raw_cum_state = state.get("cum_state", {})
         raw_unique_ids = state.get("unique_scenario_ids", [])
-        self._cum_state = {str(k): float(v) for k, v in raw_cum_state.items()}
-        self._unique_scenario_ids = {str(v) for v in raw_unique_ids}
+        restored_cum_state = {str(k): float(v) for k, v in raw_cum_state.items()}
+        restored_unique_ids = {str(v) for v in raw_unique_ids}
+
+        self._shared_state.cum_state.clear()
+        self._shared_state.cum_state.update(restored_cum_state)
+        self._shared_state.unique_scenario_ids.clear()
+        self._shared_state.unique_scenario_ids.update(restored_unique_ids)
+
+        # Keep local references aligned with the shared state so derived builders
+        # created before or after resume observe the same cumulative state.
+        self._cum_state = self._shared_state.cum_state
+        self._unique_scenario_ids = self._shared_state.unique_scenario_ids
 
     def _validate_and_add(self, key: str, value: float) -> None:
         if key.endswith("_cum"):
diff --git a/src/art/model.py b/src/art/model.py
index 91f9e81b..f23cad63 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -33,6 +33,7 @@
 COSTS_METRIC_PREFIX = "costs_"
 COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
 METRICS_BUILDER_STATE_KEY = "_metrics_builder_state"
+BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_")
 METRIC_SECTIONS = frozenset(
     {
         "reward",
@@ -508,6 +509,9 @@ def _extract_non_cost_metrics(
                         f"{cost_context}/{component}", numeric_value
                     )
                 continue
+            if metric.startswith(BUILDER_CUMULATIVE_PREFIXES):
+                self._metrics_builder.add_metric(metric, numeric_value)
+                continue
             non_cost_metrics[metric] = numeric_value
         return non_cost_metrics
 

From 26c7406467510f4d4a08393bb573d99276eb7e75 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 10:56:07 -0700
Subject: [PATCH 17/46] test: Cover taxonomy timing and data metrics

---
 tests/unit/test_frontend_logging.py | 82 +++++++++++++++++++++++++----
 tests/unit/test_metrics_builder.py  | 17 ++++++
 tests/unit/test_track_api_cost.py   |  1 +
 3 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 1ece42e6..3230af03 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -225,8 +225,9 @@ async def test_history_appends_entries(
         history_path = tmp_path / "test-project/models/test-model/history.jsonl"
         df = pl.read_ndjson(str(history_path))
 
-        # Should have 2 entries
-        assert len(df) == 2
+        # Each log call now emits the primary metrics row plus a taxonomy
+        # row for cumulative data/time metrics.
+        assert len(df) == 4
 
         # Check both splits are present
         columns = df.columns
@@ -506,6 +507,62 @@ async def test_train_trajectory_metrics_default_to_reward_prefix(
         assert entry["reward/custom_score"] == 1.0
         assert entry["reward/prefixed"] == 2.0
 
+    @pytest.mark.asyncio
+    async def test_train_logs_add_default_data_metrics_from_trajectory_groups(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        trajectories = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.8,
+                        messages_and_choices=[{"role": "user", "content": "a"}],
+                    ),
+                    Trajectory(
+                        reward=0.2,
+                        messages_and_choices=[{"role": "user", "content": "b"}],
+                    ),
+                ],
+                metadata={"scenario_scenario_id": "scenario-1"},
+            ),
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.5,
+                        messages_and_choices=[{"role": "user", "content": "c"}],
+                    )
+                ],
+                exceptions=[],
+                metadata={"scenario_scenario_id": "scenario-2"},
+            ),
+        ]
+
+        await model.log(trajectories, split="train", step=1)
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            rows = [json.loads(line) for line in f if line.strip()]
+
+        merged: dict[str, float] = {}
+        for row in rows:
+            merged.update(row)
+
+        assert merged["data/step_num_scenarios"] == pytest.approx(2.0)
+        assert merged["data/step_num_trajectories"] == pytest.approx(3.0)
+        assert merged["data/step_num_groups_submitted"] == pytest.approx(2.0)
+        assert merged["data/step_num_groups_trainable"] == pytest.approx(1.0)
+        assert merged["data/cum_num_unique_scenarios"] == pytest.approx(2.0)
+        assert merged["train/num_groups_submitted"] == pytest.approx(2.0)
+        assert merged["train/num_groups_trainable"] == pytest.approx(1.0)
+        assert merged["train/num_trajectories"] == pytest.approx(3.0)
+
     @pytest.mark.asyncio
     async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
         model = Model(
@@ -798,13 +855,18 @@ async def mock_train_sft(*args, **kwargs):
         with open(history_path) as f:
             lines = f.readlines()
 
-        assert len(lines) == 1, f"Expected 1 log entry, got {len(lines)}"
+        assert len(lines) == 2, f"Expected 2 log entries, got {len(lines)}"
+
+        entries = [json.loads(line) for line in lines]
+        merged: dict[str, float] = {}
+        for entry in entries:
+            merged.update(entry)
 
-        # Verify metrics are aggregated (averaged)
-        entry = json.loads(lines[0])
-        assert entry["step"] == 1
-        assert entry["loss/train"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
-        assert entry["loss/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
+        assert all(entry["step"] == 1 for entry in entries)
+        assert merged["loss/train"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
+        assert merged["loss/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
+        assert merged["time/step_trainer_s"] >= 0
+        assert merged["time/step_trainer_s_cum"] >= 0
 
     @pytest.mark.asyncio
     async def test_train_sft_single_step_increment(self, tmp_path: Path):
@@ -841,8 +903,8 @@ async def mock_train_sft(*args, **kwargs):
         history_path = tmp_path / "test-project/models/test-sft-step/history.jsonl"
         df = pl.read_ndjson(str(history_path))
 
-        assert len(df) == 1, "Should have exactly 1 log entry"
-        assert df["step"][0] == 1, "Step should be 1 (single increment)"
+        assert len(df) == 2, "Should have exactly 2 log entries"
+        assert set(df["step"].to_list()) == {1}, "Step should be 1 (single increment)"
 
     @pytest.mark.asyncio
     async def test_train_sft_no_metrics_when_empty(self, tmp_path: Path):
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index db083242..6b032c05 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -78,6 +78,23 @@ async def test_helper_metrics_accumulate_within_a_single_step(self) -> None:
         assert metrics["throughput/step_trainer_idle_s"] == pytest.approx(1.5)
         assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0)
 
+    @pytest.mark.asyncio
+    async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -> None:
+        """After one flushed step, check cumulative idle totals and average tok/s."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_metric("time/step_trainer_s", 4.0)
+        builder.add_metric("data/step_trainer_tokens", 40.0)
+        builder.add_metric("time/step_actor_s", 2.0)
+        builder.add_metric("data/step_actor_tokens", 10.0)
+        builder.add_idle_times(step_trainer_idle_s=1.5, step_actor_idle_s=0.5)
+
+        metrics = await builder.flush(step=1)
+
+        assert metrics["throughput/cum_trainer_idle_s"] == pytest.approx(1.5)  # single step: cum == step
+        assert metrics["throughput/cum_actor_idle_s"] == pytest.approx(0.5)  # single step: cum == step
+        assert metrics["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)  # 40 tokens / 4 s
+        assert metrics["throughput/avg_actor_tok_per_s"] == pytest.approx(5.0)  # 10 tokens / 2 s
+
     @pytest.mark.asyncio
     async def test_costs_all_generated_for_single_and_multiple_children(self) -> None:
         single = MetricsBuilder(cost_context="train")
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index 4cb6dd34..09915d12 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -308,3 +308,4 @@ async def eval_fn(
             rows = [json.loads(line) for line in f if line.strip()]
 
         assert any("costs/eval/llm_judge/correctness" in row for row in rows)
+        assert any("time/step_eval_s" in row for row in rows)

From 02a3c58e897996ba6206c695016202ea11e42b26 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 10:56:10 -0700
Subject: [PATCH 18/46] feat: Emit time and data metrics across training flows

---
 src/art/local/backend.py            |  67 ++++++++++++++-----
 src/art/metrics.py                  |  38 +++++++++++
 src/art/metrics_taxonomy.py         | 100 ++++++++++++++++++++++++++++
 src/art/model.py                    |  58 +++++++++++++++-
 src/art/pipeline_trainer/trainer.py |  75 ++++++++++++++++++---
 src/art/serverless/backend.py       |  34 +++++++++-
 src/art/tinker_native/backend.py    |  13 +++-
 7 files changed, 355 insertions(+), 30 deletions(-)

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index aecef80a..6ff4a53a 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -6,6 +6,7 @@
 import shutil
 import socket
 import subprocess
+import time
 from types import TracebackType
 from typing import AsyncIterator, Iterable, Literal, cast
 import warnings
@@ -39,7 +40,13 @@
 
 from .. import dev
 from ..backend import AnyTrainableModel, Backend
-from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics
+from ..metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    build_data_metrics_from_summary,
+    build_train_metrics_from_summary,
+    rename_train_metrics,
+    summarize_trajectory_groups,
+)
 from ..model import Model, TrainableModel
 from ..preprocessing.pack import (
     PackedTensors,
@@ -568,6 +575,7 @@ async def train(  # type: ignore[override]
 
         # Collect metrics from training
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self._train_model(
             model, groups_list, config, dev_config, verbose
         ):
@@ -582,6 +590,22 @@ async def train(  # type: ignore[override]
                 for k in {k for d in training_metrics for k in d}
                 if k != TRAIN_GRADIENT_STEPS_KEY
             }
+        summary = summarize_trajectory_groups(groups_list)
+        avg_metrics.setdefault(
+            "time/step_trainer_s", time.monotonic() - trainer_started
+        )
+        avg_metrics.update(
+            {
+                key: value
+                for key, value in {
+                    **build_data_metrics_from_summary(
+                        summary, include_trainable_groups=True
+                    ),
+                    **build_train_metrics_from_summary(summary),
+                }.items()
+                if key not in avg_metrics
+            }
+        )
 
         # Get step and checkpoint path
         step = await self._get_step(model)
@@ -619,13 +643,11 @@ async def _train_model(
         if verbose:
             print("Packing tensors...")
 
-        # Count submitted groups and trainable groups
-        num_groups_submitted = len(trajectory_groups)
-        num_groups_trainable = sum(
-            1
-            for group in trajectory_groups
-            if group and len(set(trajectory.reward for trajectory in group)) > 1
-        )
+        summary = summarize_trajectory_groups(trajectory_groups)
+        base_metrics = {
+            **build_data_metrics_from_summary(summary, include_trainable_groups=True),
+            **build_train_metrics_from_summary(summary),
+        }
 
         packed_tensors = self._get_packed_tensors(
             model,
@@ -687,16 +709,20 @@ async def _train_model(
             # Yield metrics showing no groups were trainable
             # (the frontend will handle logging)
             yield {
-                "train/num_groups_submitted": float(num_groups_submitted),
+                **base_metrics,
+                "data/step_num_groups_trainable": 0.0,
                 "train/num_groups_trainable": 0.0,
+                "data/step_trainer_tokens": 0.0,
                 TRAIN_GRADIENT_STEPS_KEY: 0.0,
             }
             return
+        base_metrics["data/step_trainer_tokens"] = float(
+            packed_tensors["assistant_mask"].sum().item()
+        )
         disk_packed_tensors = packed_tensors_to_dir(
             packed_tensors, f"{get_model_dir(model=model, art_path=self._path)}/tensors"
         )
         # Note: scale_learning_rate_by_reward_std_dev is now handled by the frontend (Model.train())
-        results: list[dict[str, float]] = []
         estimated_gradient_steps = disk_packed_tensors["num_sequences"]
         pbar = tqdm.tqdm(total=estimated_gradient_steps, desc="train")
         async for result in service.train(
@@ -709,8 +735,11 @@ async def _train_model(
             assert num_gradient_steps == estimated_gradient_steps, (
                 f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
             )
-            results.append(result)
-            yield {**result, TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps)}
+            yield {
+                **base_metrics,
+                **result,
+                TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps),
+            }
             pbar.update(1)
             pbar.set_postfix(result)
         pbar.close()
@@ -797,15 +826,21 @@ async def _train_sft(
         service = await self._get_service(model)
 
         pbar = tqdm.tqdm(total=len(batches), desc="sft train")
-        total_trainable_tokens = 0
+        total_trainable_tokens = sum(batch.num_trainable_tokens for batch in batches)
+        total_trajectories = len(trajectory_list)
         batch_count = 0
 
         async for result in service.train_sft(batches, verbose):
             pbar.update(1)
-            pbar.set_postfix({"loss": f"{result.get('loss', 0):.4f}"})
-            total_trainable_tokens += result.get("num_trainable_tokens", 0)
+            pbar.set_postfix({"loss": f"{result.get('loss/train', 0):.4f}"})
             batch_count += 1
-            yield result
+            yield {
+                **result,
+                "data/step_num_trajectories": float(total_trajectories),
+                "data/step_trainer_tokens": float(total_trainable_tokens),
+                "train/num_trajectories": float(total_trajectories),
+                "train/num_trainable_tokens": float(total_trainable_tokens),
+            }
 
         pbar.close()
 
diff --git a/src/art/metrics.py b/src/art/metrics.py
index ea299427..f31e0b6f 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -7,6 +7,7 @@
 from dataclasses import dataclass
 from functools import wraps
 from inspect import iscoroutinefunction
+import time
 from typing import Any, ParamSpec, TypeVar
 
 from .costs import tokens_to_cost
@@ -14,6 +15,10 @@
 _active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder")
 
 _HIERARCHICAL_SECTIONS = {"costs", "time", "data"}
+_THROUGHPUT_IDLE_MAPPINGS = {
+    "throughput/step_trainer_idle_s": "throughput/cum_trainer_idle_s",
+    "throughput/step_actor_idle_s": "throughput/cum_actor_idle_s",
+}
 _DEFAULT_PROVIDER = "openai"
 _OPENAI_PROVIDER = "openai"
 _ANTHROPIC_PROVIDER = "anthropic"
@@ -210,6 +215,14 @@ def add_idle_times(
         if step_actor_idle_s is not None:
             self.add_metric("throughput/step_actor_idle_s", float(step_actor_idle_s))
 
+    @contextmanager
+    def measure(self, key: str):  # context manager that times the wrapped body
+        started = time.monotonic()  # monotonic clock: immune to system clock adjustments
+        try:
+            yield
+        finally:  # runs even if the body raises, so the elapsed time is always recorded
+            self.add_metric(key, time.monotonic() - started)  # elapsed seconds under `key`
+
     async def flush(self, step: int) -> dict[str, float]:
         del step
         async with self._lock:
@@ -237,6 +250,7 @@ async def flush(self, step: int) -> dict[str, float]:
                     len(self._unique_scenario_ids)
                 )
 
+            self._update_throughput_metrics(result)
             self._step_buffer.clear()
             return result
 
@@ -369,6 +383,30 @@ def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]:
 
         return rollups
 
+    def _update_throughput_metrics(self, result: dict[str, float]) -> None:
+        """Add cumulative idle totals and average tokens-per-second to ``result`` in place."""
+        totals = self._cum_state
+        for step_key, cum_key in _THROUGHPUT_IDLE_MAPPINGS.items():
+            if step_key in result:
+                running = totals.get(cum_key, 0.0) + result[step_key]
+                totals[cum_key] = result[cum_key] = running
+
+        # Each average is cumulative tokens divided by cumulative seconds for that role.
+        rate_inputs = {
+            "throughput/avg_trainer_tok_per_s": (
+                "data/step_trainer_tokens_cum",
+                "time/step_trainer_s_cum",
+            ),
+            "throughput/avg_actor_tok_per_s": (
+                "data/step_actor_tokens_cum",
+                "time/step_actor_s_cum",
+            ),
+        }
+        for rate_key, (tokens_key, seconds_key) in rate_inputs.items():
+            tokens, seconds = totals.get(tokens_key), totals.get(seconds_key)
+            if tokens is not None and seconds is not None and seconds > 0:
+                result[rate_key] = tokens / seconds
+
     def _resolve_token_pricing(
         self,
         provider: str | None,
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
index e7d108d4..79a175c3 100644
--- a/src/art/metrics_taxonomy.py
+++ b/src/art/metrics_taxonomy.py
@@ -1,5 +1,19 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Iterable
+
+from .trajectories import TrajectoryGroup
+
 TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps"
 
+_SCENARIO_ID_CANDIDATE_KEYS = (
+    "scenario_id",
+    "scenario_scenario_id",
+    "scenario_idx",
+    "scenario_scenario_idx",
+)
+
 TRAIN_METRIC_KEY_RENAMES = {
     "reward": "reward/mean",
     "reward_std_dev": "reward/std_dev",
@@ -29,3 +43,89 @@ def rename_train_metric_key(metric: str) -> str:
 
 def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]:
     return {rename_train_metric_key(key): float(value) for key, value in metrics.items()}
+
+
+@dataclass(frozen=True)
+class TrajectoryBatchSummary:  # immutable counts produced by summarize_trajectory_groups
+    num_scenarios: int  # populated with the number of groups in the batch
+    num_trajectories: int  # trajectories plus exceptions, summed over all groups
+    num_groups_submitted: int  # number of groups in the batch
+    num_groups_trainable: int  # groups whose trajectories carry more than one distinct reward
+    scenario_ids: list[str]  # unique scenario ids in first-seen order
+
+
+def summarize_trajectory_groups(
+    trajectory_groups: Iterable[TrajectoryGroup],
+) -> TrajectoryBatchSummary:
+    """Summarize a batch of trajectory groups into counts and unique scenario ids.
+
+    The iterable is materialized once. Exceptions count toward ``num_trajectories``,
+    and scenario ids keep first-seen order; groups without an id contribute none.
+    """
+    batch = list(trajectory_groups)
+    found_ids = (_extract_scenario_id(group) for group in batch)
+    # dict.fromkeys drops duplicates while preserving first-seen order.
+    unique_ids = dict.fromkeys(sid for sid in found_ids if sid is not None)
+
+    # num_scenarios and num_groups_submitted are both the batch size.
+    return TrajectoryBatchSummary(
+        num_scenarios=len(batch),
+        num_trajectories=sum(len(g.trajectories) + len(g.exceptions) for g in batch),
+        num_groups_submitted=len(batch),
+        num_groups_trainable=sum(1 for g in batch if _group_is_trainable(g)),
+        scenario_ids=list(unique_ids),
+    )
+
+
+def build_data_metrics_from_summary(
+    summary: TrajectoryBatchSummary,
+    *,
+    include_trainable_groups: bool,
+) -> dict[str, float]:
+    """Return float ``data/step_*`` counters; the trainable-group count is opt-in."""
+    trainable = {"data/step_num_groups_trainable": float(summary.num_groups_trainable)}
+    return {
+        "data/step_num_scenarios": float(summary.num_scenarios),
+        "data/step_num_trajectories": float(summary.num_trajectories),
+        "data/step_num_groups_submitted": float(summary.num_groups_submitted),
+        **(trainable if include_trainable_groups else {}),
+    }
+
+
+def build_train_metrics_from_summary(summary: TrajectoryBatchSummary) -> dict[str, float]:
+    """Return the batch's group and trajectory counts as float ``train/*`` metrics."""
+    counts = {
+        "train/num_groups_submitted": summary.num_groups_submitted,
+        "train/num_groups_trainable": summary.num_groups_trainable,
+        "train/num_trajectories": summary.num_trajectories,
+    }
+    return {key: float(count) for key, count in counts.items()}
+
+
+def _group_is_trainable(group: TrajectoryGroup) -> bool:
+    """Return True when the group's trajectories carry at least two distinct rewards."""
+    return len({trajectory.reward for trajectory in group.trajectories}) > 1
+
+
+def _extract_scenario_id(group: TrajectoryGroup) -> str | None:
+    """Return the first scenario id found in group metadata, then trajectory metadata."""
+    sources = [group.metadata] + [t.metadata for t in group.trajectories]
+    found = (_extract_scenario_id_from_metadata(source) for source in sources)
+    # The generator is lazy, so lookup stops at the first metadata dict yielding an id.
+    return next((sid for sid in found if sid is not None), None)
+
+
+def _extract_scenario_id_from_metadata(
+    metadata: dict[str, Any],
+) -> str | None:
+    """Return the scenario id stored in ``metadata`` as a string, or None.
+
+    Known candidate keys are tried first, then any key with a scenario id suffix.
+    """
+    for candidate in _SCENARIO_ID_CANDIDATE_KEYS:
+        if metadata.get(candidate) is not None:
+            return str(metadata[candidate])
+    for name, raw in metadata.items():
+        if raw is not None and name.endswith(("scenario_id", "scenario_idx")):
+            return str(raw)
+    return None
diff --git a/src/art/model.py b/src/art/model.py
index f23cad63..c52705c5 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -16,7 +16,12 @@
 from . import dev
 from .costs import CostCalculator
 from .metrics import MetricsBuilder
-from .metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY
+from .metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    build_data_metrics_from_summary,
+    build_train_metrics_from_summary,
+    summarize_trajectory_groups,
+)
 from .trajectories import Trajectory, TrajectoryGroup
 from .types import TrainConfig, TrainSFTConfig
 from .utils.trajectory_logging import write_trajectory_groups_parquet
@@ -33,7 +38,7 @@
 COSTS_METRIC_PREFIX = "costs_"
 COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
 METRICS_BUILDER_STATE_KEY = "_metrics_builder_state"
-BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_")
+BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_", "throughput/step_")
 METRIC_SECTIONS = frozenset(
     {
         "reward",
@@ -515,6 +520,39 @@ def _extract_non_cost_metrics(
             non_cost_metrics[metric] = numeric_value
         return non_cost_metrics
 
+    def _add_default_step_metrics(
+        self,
+        trajectory_groups: list[TrajectoryGroup],
+        *,
+        split: str,
+        provided_metric_keys: set[str],
+    ) -> dict[str, float]:
+        """Record default ``data/*`` step metrics; return default ``train/*`` metrics.
+
+        Splits outside ``METRIC_SPLITS`` are ignored. Keys in ``provided_metric_keys``
+        are never overridden. The returned mapping is empty unless ``split`` is
+        ``"train"``.
+        """
+        if split not in METRIC_SPLITS:
+            return {}
+
+        is_train = split == "train"
+        summary = summarize_trajectory_groups(trajectory_groups)
+        data_defaults = build_data_metrics_from_summary(
+            summary, include_trainable_groups=is_train
+        )
+        for name, amount in data_defaults.items():
+            if name not in provided_metric_keys:
+                self._metrics_builder.add_metric(name, amount)
+        # Only forward scenario ids when at least one was found.
+        if summary.scenario_ids:
+            self._metrics_builder.add_data(scenario_ids=summary.scenario_ids)
+
+        if not is_train:
+            return {}
+        train_defaults = build_train_metrics_from_summary(summary)
+        return {k: v for k, v in train_defaults.items() if k not in provided_metric_keys}
+
     def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder:
         if cost_context is None:
             return self._metrics_builder
@@ -595,6 +633,12 @@ async def log(
         else:
             trajectory_groups = cast(list[TrajectoryGroup], list(trajectories))
 
+        default_train_metrics = self._add_default_step_metrics(
+            trajectory_groups,
+            split=split,
+            provided_metric_keys=set(metrics or {}),
+        )
+
         # Ensure output directories exist
         output_dir = self._get_output_dir()
         trajectories_dir = f"{output_dir}/trajectories/{split}"
@@ -663,6 +707,8 @@ async def log(
             if len(values) > 0:
                 averages[metric] = sum(values) / len(values)
 
+        averages.update(default_train_metrics)
+
         # Aggregate group-level metrics once per group
         for metric, values in group_metrics.items():
             if len(values) > 0:
@@ -907,6 +953,7 @@ async def train(
 
         # 1. Train (backend no longer logs internally)
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self.backend()._train_model(
             self,
             groups_list,
@@ -915,6 +962,7 @@ async def train(
             verbose,
         ):
             training_metrics.append(metrics)
+        trainer_elapsed = time.monotonic() - trainer_started
 
         # 2. Calculate aggregated training metrics
         avg_metrics: dict[str, float] = {}
@@ -925,6 +973,7 @@ async def train(
                 for k in {k for d in training_metrics for k in d}
                 if k != TRAIN_GRADIENT_STEPS_KEY
             }
+        avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed)
 
         # 3. Log trajectories and training metrics together (single wandb log call)
         step = await self.get_step()
@@ -955,6 +1004,7 @@ async def train_sft(
         # Collect all metrics and aggregate them at the end (same as RL)
         _config = _config or {}  # ty:ignore[invalid-assignment]
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self.backend()._train_sft(
             self,
             trajectories,
@@ -963,6 +1013,7 @@ async def train_sft(
             verbose,
         ):
             training_metrics.append(metrics)
+        trainer_elapsed = time.monotonic() - trainer_started
 
         # Log aggregated training metrics once (same as RL)
         if training_metrics:
@@ -971,6 +1022,7 @@ async def train_sft(
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
             }
+            avg_metrics["time/step_trainer_s"] = trainer_elapsed
             # Get the current step after training
             step = await self.get_step()
-            self._log_metrics(avg_metrics, "train", step)
+            await self.log(trajectories=None, split="train", metrics=avg_metrics, step=step)
diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py
index 9dcec1cd..a32ad1b8 100644
--- a/src/art/pipeline_trainer/trainer.py
+++ b/src/art/pipeline_trainer/trainer.py
@@ -16,6 +16,8 @@
 from .types import ConfigT, EvalFn, RolloutFn, ScenarioT, SingleRolloutFn  # noqa: F401
 
 PIPELINE_STATE_KEY = "_pipeline_trainer"
+_ROLLOUT_WALL_TIME_KEY = "_art_rollout_wall_s"
+_ACTOR_IDLE_TIME_KEY = "_art_actor_idle_s"
 
 
 def _to_async_iterator(iterable: Iterable[T] | AsyncIterator[T]) -> AsyncIterator[T]:
@@ -322,17 +324,21 @@ async def _rollout_worker(self, worker_id: int) -> None:
             self._status.note_rollout_started()
             errored = False
             try:
+                wait_started = time.monotonic()
                 await self._wait_for_policy()
+                actor_idle_s = time.monotonic() - wait_started
                 if self.state.done:
                     break
 
                 initial_version = self.state.policy_version
 
                 token = self.model.activate_metrics_context("train")
+                rollout_started = time.monotonic()
                 try:
                     group = await self.rollout_fn(self.model, scenario, self.config)
                 finally:
                     token.var.reset(token)
+                rollout_wall_s = time.monotonic() - rollout_started
                 if not isinstance(group, TrajectoryGroup):
                     errored = True
                     continue
@@ -344,7 +350,9 @@ async def _rollout_worker(self, worker_id: int) -> None:
                 )
                 if self.state.done:
                     break
-                await self._put_output_group(group)
+                queue_wait_s = await self._put_output_group(group)
+                group.metadata[_ROLLOUT_WALL_TIME_KEY] = rollout_wall_s
+                group.metadata[_ACTOR_IDLE_TIME_KEY] = actor_idle_s + queue_wait_s
             except asyncio.CancelledError:
                 raise
             except Exception as exc:
@@ -383,13 +391,17 @@ async def _training_stage(self) -> None:
             if stop_at_step is not None and current_step >= stop_at_step:
                 break
             step_start = time.monotonic()
+            collect_started = time.monotonic()
             batch, discarded, saw_sentinel = await self._collect_batch(current_step)
+            trainer_idle_s = time.monotonic() - collect_started
             self.state.discarded_stale_samples += discarded
             if discarded:
                 self._status.note_stale(discarded)
             if not batch:
                 break
 
+            actor_wall_s, actor_idle_s = self._consume_batch_rollout_timings(batch)
+
             expected_step = current_step + 1
             should_eval_step = self._should_eval_step(expected_step)
             should_checkpoint = self.save_checkpoint and should_eval_step
@@ -399,10 +411,9 @@ async def _training_stage(self) -> None:
                 self.state.policy_updated.notify_all()
 
             self._status.note_training_start(len(batch))
-            train_call_start: float | None = None
+            train_call_start = time.monotonic()
             if os.getenv("ART_TRAIN_STEP_LOG"):
                 print(f"[train] step {expected_step} starting (batch={len(batch)})")
-                train_call_start = time.perf_counter()
             try:
                 result = await self.backend.train(
                     self.model,
@@ -418,8 +429,8 @@ async def _training_stage(self) -> None:
                 self._status.note_training_end()
                 raise
             finally:
-                if train_call_start is not None:
-                    train_call_elapsed = time.perf_counter() - train_call_start
+                train_call_elapsed = time.monotonic() - train_call_start
+                if os.getenv("ART_TRAIN_STEP_LOG"):
                     print(
                         f"[train] step {expected_step} done in "
                         f"{train_call_elapsed:.1f}s"
@@ -442,7 +453,14 @@ async def _training_stage(self) -> None:
                     ),
                     "steps_off_policy": steps_off_policy,
                     "num_groups": float(len(batch)),
+                    "time/step_wall_s": step_seconds,
+                    "throughput/step_trainer_idle_s": trainer_idle_s,
                 }
+                metrics.setdefault("time/step_trainer_s", train_call_elapsed)
+                if actor_wall_s > 0:
+                    metrics["time/step_actor_s"] = actor_wall_s
+                if actor_idle_s > 0:
+                    metrics["throughput/step_actor_idle_s"] = actor_idle_s
                 metrics.update(result.metrics)
 
                 await self.model.log(
@@ -565,18 +583,22 @@ async def _run_eval(self, step: int) -> None:
         assert self.eval_fn is not None
         self._status.note_val_started(step)
         reward: float | None = None
+        eval_elapsed = 0.0
         try:
             token = self.model.activate_metrics_context("eval")
+            eval_started = time.monotonic()
             try:
                 result = await self.eval_fn(self.model, step, self.config)
             finally:
                 token.var.reset(token)
+                eval_elapsed = time.monotonic() - eval_started
             splits: dict[str, list[art.Trajectory | art.TrajectoryGroup]]
             if isinstance(result, dict):
                 splits = result
             else:
                 splits = {"val": result}
 
+            logged_eval_timing = False
             for split_name, items in splits.items():
                 groups, trajectories = self._normalize_eval_items(items)
                 if split_name == "val":
@@ -585,7 +607,25 @@ async def _run_eval(self, step: int) -> None:
                     else:
                         reward = None
                 if groups:
-                    await self.model.log(groups, split=split_name, step=step)
+                    metrics = (
+                        {"time/step_eval_s": eval_elapsed}
+                        if not logged_eval_timing
+                        else None
+                    )
+                    await self.model.log(
+                        groups,
+                        split=split_name,
+                        step=step,
+                        metrics=metrics,
+                    )
+                    logged_eval_timing = True
+            if not logged_eval_timing and eval_elapsed > 0:
+                await self.model.log(
+                    trajectories=None,
+                    split="val",
+                    step=step,
+                    metrics={"time/step_eval_s": eval_elapsed},
+                )
         except asyncio.CancelledError:
             raise
         except Exception as exc:
@@ -742,12 +782,31 @@ def _persist_state(self, training_step: int) -> None:
     def _is_scalar_metadata(value: object) -> bool:
         return value is None or isinstance(value, (str, int, float, bool))
 
-    async def _put_output_group(self, group: TrajectoryGroup) -> None:
+    async def _put_output_group(self, group: TrajectoryGroup) -> float:
         assert self._output_queue is not None
+        queue_wait_started = time.monotonic()
         while not self.state.done:
             try:
                 await asyncio.wait_for(self._output_queue.put(group), timeout=1.0)
                 self._status.note_group_enqueued(group)
-                return
+                return time.monotonic() - queue_wait_started
             except asyncio.TimeoutError:
                 continue
+        return time.monotonic() - queue_wait_started
+
+    def _consume_batch_rollout_timings(
+        self, batch: list[TrajectoryGroup]
+    ) -> tuple[float, float]:
+        """Pop per-group timing metadata and return (rollout wall, actor idle) totals."""
+        wall_total = idle_total = 0.0
+        for group in batch:
+            wall_total += self._pop_float_metadata(group, _ROLLOUT_WALL_TIME_KEY)
+            idle_total += self._pop_float_metadata(group, _ACTOR_IDLE_TIME_KEY)
+        return wall_total, idle_total
+
+    @staticmethod
+    def _pop_float_metadata(group: TrajectoryGroup, key: str) -> float:
+        """Remove ``key`` from the group's metadata; return it as a float, else 0.0."""
+        popped = group.metadata.pop(key, 0.0)
+        # Non-numeric values are discarded rather than raising.
+        return float(popped) if isinstance(popped, (int, float)) else 0.0
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index f9ab8c5f..ede695cc 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -1,4 +1,5 @@
 import asyncio
+import time
 from typing import TYPE_CHECKING, Any, AsyncIterator, Iterable, Literal
 import warnings
 
@@ -9,7 +10,13 @@
 
 from .. import dev
 from ..backend import AnyTrainableModel, Backend
-from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics
+from ..metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    build_data_metrics_from_summary,
+    build_train_metrics_from_summary,
+    rename_train_metrics,
+    summarize_trajectory_groups,
+)
 from ..trajectories import Trajectory, TrajectoryGroup
 from ..types import ServerlessTrainResult, TrainConfig, TrainSFTConfig
 from ..utils.record_provenance import record_provenance
@@ -236,6 +243,7 @@ async def train(  # type: ignore[override]
 
         # Collect metrics from training
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self._train_model(
             model, groups_list, config, dev_config, verbose
         ):
@@ -250,6 +258,22 @@ async def train(  # type: ignore[override]
                 for k in {k for d in training_metrics for k in d}
                 if k != TRAIN_GRADIENT_STEPS_KEY
             }
+        summary = summarize_trajectory_groups(groups_list)
+        avg_metrics.setdefault(
+            "time/step_trainer_s", time.monotonic() - trainer_started
+        )
+        avg_metrics.update(
+            {
+                key: value
+                for key, value in {
+                    **build_data_metrics_from_summary(
+                        summary, include_trainable_groups=True
+                    ),
+                    **build_train_metrics_from_summary(summary),
+                }.items()
+                if key not in avg_metrics
+            }
+        )
 
         # Get step and artifact name
         step = await self._get_step(model)
@@ -276,6 +300,11 @@ async def _train_model(
         dev_config: dev.TrainConfig,
         verbose: bool = False,
     ) -> AsyncIterator[dict[str, float]]:
+        summary = summarize_trajectory_groups(trajectory_groups)
+        base_metrics = {
+            **build_data_metrics_from_summary(summary, include_trainable_groups=True),
+            **build_train_metrics_from_summary(summary),
+        }
         assert model.id is not None, "Model ID is required"
         training_job = await self._client.training_jobs.create(  # ty:ignore[possibly-missing-attribute]
             model_id=model.id,
@@ -312,6 +341,7 @@ async def _train_model(
                         {k: float(v) for k, v in event.data.items()}
                     )
                     yield {
+                        **base_metrics,
                         **metrics,
                         TRAIN_GRADIENT_STEPS_KEY: float(num_sequences),
                     }
@@ -484,6 +514,8 @@ async def _train_sft(
                     )
                     yield {
                         **metrics,
+                        "data/step_num_trajectories": float(num_trajectories),
+                        "train/num_trajectories": float(num_trajectories),
                         TRAIN_GRADIENT_STEPS_KEY: float(num_batches),
                     }
                 elif event.type == "training_started":
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index 19df73dd..0c6a1654 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -30,7 +30,12 @@
 from .. import dev
 from ..backend import Backend
 from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing
-from ..metrics_taxonomy import rename_train_metric_key
+from ..metrics_taxonomy import (
+    build_data_metrics_from_summary,
+    build_train_metrics_from_summary,
+    rename_train_metric_key,
+    summarize_trajectory_groups,
+)
 from ..model import Model, TrainableModel
 from ..tinker.backend import get_renderer_name
 from ..tinker.server import get_free_port
@@ -209,6 +214,7 @@ async def train(  # type: ignore[override]
     ) -> TrainResult:
         state = self._model_state[model.name]
         groups_list = list(trajectory_groups)
+        summary = summarize_trajectory_groups(groups_list)
 
         datums = trajectory_groups_to_datums(
             groups_list,
@@ -218,7 +224,8 @@ async def train(  # type: ignore[override]
         )
 
         metrics: dict[str, float] = {
-            "train/num_groups_submitted": float(len(groups_list)),
+            **build_data_metrics_from_summary(summary, include_trainable_groups=True),
+            **build_train_metrics_from_summary(summary),
             "data/step_num_datums": float(len(datums)),
         }
 
@@ -234,6 +241,7 @@ async def train(  # type: ignore[override]
             metrics["costs/train/tinker_train"] = compute_train_cost(
                 train_tokens, pricing
             )
+        trainer_started = time.monotonic()
 
         if adam_params is None:
             adam_params = tinker.AdamParams(
@@ -301,6 +309,7 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum:
 
         state.current_step = next_step
         self._persist_model_state(model, state)
+        metrics["time/step_trainer_s"] = time.monotonic() - trainer_started
 
         return TrainResult(step=state.current_step, metrics=metrics)
 

From 59de04d2e0cb40016bec1edc6c75e1f27ff98261 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 10:56:13 -0700
Subject: [PATCH 19/46] docs: Document auto-emitted taxonomy metrics

---
 docs/metrics-taxonomy.md | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index b5a2294c..300bdf0b 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -47,6 +47,44 @@ ART rolls costs up automatically:
 - parent rollups (for example `costs/train`, `costs/all`)
 - cumulative keys with `_cum` suffix (for example `costs/all_cum`)
 
+## Metrics Added By ART
+
+ART now emits the following metrics from library internals where the data is available:
+
+- `reward/*` aggregates from `model.log(..., split="train")`
+- `loss/*` from trainer backends
+- `time/wall_clock_sec` and `training_step` on every logged row
+- `time/step_trainer_s` for training calls
+- `time/step_wall_s`, `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer`
+- `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`
+- `data/step_num_groups_trainable` for train splits
+- `data/cum_num_unique_scenarios` when scenario IDs are present in group or trajectory metadata
+- `data/step_trainer_tokens` where the backend knows the trainer token count
+- `throughput/cum_trainer_idle_s`, `throughput/cum_actor_idle_s`
+- `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available
+
+Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer.
+
+## User Helpers
+
+Use the builder helpers for step-level metrics that only user code can know:
+
+```python
+builder = model.metrics_builder()
+
+with builder.measure("time/step_actor_s"):
+    result = await run_rollouts()
+
+builder.add_data(
+    step_actor_tokens=result.actor_tokens,
+    scenario_ids=result.scenario_ids,
+)
+
+builder.add_idle_times(step_actor_idle_s=result.actor_idle_s)
+```
+
+If these metrics are logged before the next `model.log(...)` flush, ART will also emit the cumulative and derived throughput metrics automatically.
+
 ## End-to-End Smoke Test
 
 Run:

From a9ce32fab14bd7fe02bd7213164694c074e4a519 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:04:04 -0700
Subject: [PATCH 20/46] feat: Add yes-no-maybe metrics example

---
 dev/yes-no-maybe-metrics.py | 261 ++++++++++++++++++++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 dev/yes-no-maybe-metrics.py

diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py
new file mode 100644
index 00000000..32729990
--- /dev/null
+++ b/dev/yes-no-maybe-metrics.py
@@ -0,0 +1,261 @@
+"""Yes-no-maybe metrics demo for the LocalBackend `model.train()` path.
+
+This keeps the same prompt family, rollout structure, and reward ordering as
+`dev/yes-no-maybe.py` while adding explicit metrics taxonomy instrumentation for
+actor/eval timing and data metrics.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from itertools import permutations
+import os
+import time
+
+from dotenv import load_dotenv
+import openai
+
+try:
+    import unsloth  # noqa: F401
+except ImportError:
+    pass
+
+import art
+from art.local import LocalBackend
+
+
+async def create_chat_completion(
+    client: openai.AsyncOpenAI,
+    *,
+    model_name: str,
+    messages: art.Messages,
+    max_tokens: int,
+    timeout: float,
+) -> openai.types.chat.chat_completion.ChatCompletion:
+    return await client.chat.completions.create(
+        messages=messages,
+        model=model_name,
+        max_tokens=max_tokens,
+        timeout=timeout,
+    )
+
+
+def with_quotes(word: str) -> str:
+    return f"'{word}'"
+
+
+def build_prompts() -> list[str]:
+    return [
+        f"{prefix} with {', '.join([with_quotes(word) if use_quotes else word for word in words]) if len(words) == 3 else f'{words[0]}' + (f' or {words[1]}' if len(words) > 1 else '')}"
+        for prefix in ["respond", "just respond"]
+        for use_quotes in [True, False]
+        for words in (
+            list(permutation)
+            for length in [3, 2]
+            for permutation in permutations(["yes", "no", "maybe"], length)
+        )
+    ]
+
+
+def reward_for_answer(content: str | None) -> float:
+    if content == "yes":
+        return 0.5
+    if content == "no":
+        return 0.75
+    if content == "maybe":
+        return 1.0
+    return 0.0
+
+
+def scenario_id_for_prompt(prompt: str) -> str:
+    return prompt.replace(" ", "_").replace("'", "")
+
+
+def response_total_tokens(
+    response: openai.types.chat.chat_completion.ChatCompletion,
+) -> int:
+    usage = response.usage
+    if usage is None:
+        return 0
+    prompt_tokens = int(usage.prompt_tokens or 0)
+    completion_tokens = int(usage.completion_tokens or 0)
+    return prompt_tokens + completion_tokens
+
+
+def total_actor_tokens(groups: list[art.TrajectoryGroup]) -> int:
+    return sum(
+        int(trajectory.metadata.get("actor_total_tokens", 0) or 0)
+        for group in groups
+        for trajectory in group.trajectories
+    )
+
+
+async def rollout(
+    client: openai.AsyncOpenAI,
+    model: art.TrainableModel,
+    prompt: str,
+    *,
+    max_tokens: int,
+    timeout: float,
+) -> art.Trajectory:
+    messages: art.Messages = [{"role": "user", "content": prompt}]
+    chat_completion = await create_chat_completion(
+        client,
+        model_name=model.get_inference_name(),
+        messages=messages,
+        max_tokens=max_tokens,
+        timeout=timeout,
+    )
+    choice = chat_completion.choices[0]
+    content = choice.message.content
+    return art.Trajectory(
+        messages_and_choices=[*messages, choice],
+        reward=reward_for_answer(content),
+        metadata={
+            "scenario_id": scenario_id_for_prompt(prompt),
+            "actor_total_tokens": response_total_tokens(chat_completion),
+        },
+        metrics={
+            "valid_answer": reward_for_answer(content) > 0.0,
+        },
+    )
+
+
+async def evaluate(
+    client: openai.AsyncOpenAI,
+    model: art.TrainableModel,
+    prompts: list[str],
+    *,
+    max_tokens: int,
+    timeout: float,
+) -> list[art.TrajectoryGroup]:
+    groups = await art.gather_trajectory_groups(
+        art.TrajectoryGroup(
+            [
+                rollout(
+                    client,
+                    model,
+                    prompt,
+                    max_tokens=max_tokens,
+                    timeout=timeout,
+                )
+            ],
+            metadata={"scenario_id": scenario_id_for_prompt(prompt)},
+        )
+        for prompt in prompts
+    )
+    return groups
+
+
+def print_history_summary(model: art.TrainableModel) -> None:
+    history_path = (
+        model.base_path + f"/{model.project}/models/{model.name}/history.jsonl"
+    )
+    print(f"History: {history_path}")
+
+
+def build_internal_config() -> art.dev.InternalModelConfig:
+    return art.dev.InternalModelConfig(
+        engine_args=art.dev.EngineArgs(
+            gpu_memory_utilization=float(
+                os.environ.get("GPU_MEMORY_UTILIZATION", "0.85")
+            ),
+            max_model_len=int(os.environ.get("MAX_MODEL_LEN", "4096")),
+        )
+    )
+
+
+async def main() -> None:
+    load_dotenv()
+
+    backend = LocalBackend()
+    base_model = os.environ.get("BASE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")
+    project = os.environ.get("PROJECT", "yes-no-maybe-metrics")
+    model = art.TrainableModel(
+        name=os.environ.get(
+            "MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}"
+        ),
+        project=project,
+        base_model=base_model,
+        report_metrics=["wandb"],
+        _internal_config=build_internal_config(),
+    )
+    try:
+        await model.register(backend)
+
+        prompts = build_prompts()
+        eval_prompts = prompts[: int(os.environ.get("EVAL_PROMPTS", "12"))]
+        openai_client = model.openai_client()
+        max_steps = int(os.environ.get("NUM_STEPS", "20"))
+        rollouts_per_prompt = int(os.environ.get("ROLLOUTS_PER_PROMPT", "32"))
+        max_tokens = int(os.environ.get("MAX_TOKENS", "100"))
+        timeout = float(os.environ.get("TIMEOUT", "100"))
+        eval_every_n_steps = int(os.environ.get("EVAL_EVERY_N_STEPS", "1"))
+        learning_rate = float(os.environ.get("LEARNING_RATE", "1e-4"))
+
+        start_step = await model.get_step()
+        for offset in range(max_steps):
+            current_step = start_step + offset
+
+            if (
+                eval_every_n_steps > 0
+                and (current_step - start_step) % eval_every_n_steps == 0
+            ):
+                eval_builder = model.metrics_builder("eval")
+                with eval_builder.activate_context():
+                    with eval_builder.measure("time/step_eval_s"):
+                        val_groups = await evaluate(
+                            openai_client,
+                            model,
+                            eval_prompts,
+                            max_tokens=max_tokens,
+                            timeout=timeout,
+                        )
+                    eval_builder.add_data(
+                        step_actor_tokens=total_actor_tokens(val_groups)
+                    )
+                await model.log(val_groups, split="val", step=current_step)
+
+            train_builder = model.metrics_builder("train")
+            step_started = time.monotonic()
+            with train_builder.activate_context():
+                with train_builder.measure("time/step_actor_s"):
+                    train_groups = await art.gather_trajectory_groups(
+                        (
+                            art.TrajectoryGroup(
+                                rollout(
+                                    openai_client,
+                                    model,
+                                    prompt,
+                                    max_tokens=max_tokens,
+                                    timeout=timeout,
+                                )
+                                for _ in range(rollouts_per_prompt)
+                            )
+                            for prompt in prompts
+                        )
+                    )
+                train_builder.add_data(
+                    step_actor_tokens=total_actor_tokens(train_groups)
+                )
+                await model.train(
+                    train_groups,
+                    config=art.TrainConfig(learning_rate=learning_rate),
+                )
+
+            step = await model.get_step()
+            await model.log(
+                trajectories=None,
+                split="train",
+                step=step,
+                metrics={"time/step_wall_s": time.monotonic() - step_started},
+            )
+            print(f"step {step} complete")
+
+        print_history_summary(model)
+    finally:
+        await backend.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 68281b5235bd80b56e51dda9a48cb53401fad777 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:26:54 -0700
Subject: [PATCH 21/46] fix: Load MetricsBuilder state before builder access

---
 src/art/model.py                    |  1 +
 tests/unit/test_frontend_logging.py | 30 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/art/model.py b/src/art/model.py
index c52705c5..dcbdc5a6 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -554,6 +554,7 @@ def _add_default_step_metrics(
         }
 
     def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder:
+        self._load_metrics_builder_state()
         if cost_context is None:
             return self._metrics_builder
         return self._metrics_builder.for_cost_context(cost_context)
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 3230af03..5719009c 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -645,6 +645,36 @@ async def test_cost_cumulative_persists_across_model_recreation(
         assert second["costs/train/prefill_cum"] == pytest.approx(1.0)
         assert second["costs/all_cum"] == pytest.approx(1.0)
 
+    @pytest.mark.asyncio
+    async def test_metrics_builder_loads_resume_state_before_builder_use(
+        self, tmp_path: Path
+    ):
+        model_1 = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        model_1.metrics_builder().add_data(scenario_ids=["scenario-a"])
+        await model_1.log(trajectories=None, split="train", step=1, metrics={})
+
+        model_2 = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        model_2.metrics_builder().add_data(scenario_ids=["scenario-b"])
+        await model_2.log(trajectories=None, split="train", step=2, metrics={})
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            first = json.loads(f.readline())
+            second = json.loads(f.readline())
+
+        assert first["data/cum_num_unique_scenarios"] == pytest.approx(1.0)
+        assert second["data/cum_num_unique_scenarios"] == pytest.approx(2.0)
+
     @pytest.mark.asyncio
     async def test_direct_time_and_data_metrics_get_cumulative_variants(
         self, tmp_path: Path

From 7b8d8f9d3acd961cd05e12c77d21308b14c42b9b Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:28:25 -0700
Subject: [PATCH 22/46] fix: Preserve gradient step metrics in train outputs

---
 src/art/local/backend.py            |  2 +-
 src/art/model.py                    |  1 -
 src/art/serverless/backend.py       |  1 -
 tests/unit/test_frontend_logging.py | 91 +++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index 6ff4a53a..bdf44179 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -588,7 +588,6 @@ async def train(  # type: ignore[override]
                 k: sum(d.get(k, 0) for d in training_metrics)
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
-                if k != TRAIN_GRADIENT_STEPS_KEY
             }
         summary = summarize_trajectory_groups(groups_list)
         avg_metrics.setdefault(
@@ -838,6 +837,7 @@ async def _train_sft(
                 **result,
                 "data/step_num_trajectories": float(total_trajectories),
                 "data/step_trainer_tokens": float(total_trainable_tokens),
+                TRAIN_GRADIENT_STEPS_KEY: float(len(batches)),
                 "train/num_trajectories": float(total_trajectories),
                 "train/num_trainable_tokens": float(total_trainable_tokens),
             }
diff --git a/src/art/model.py b/src/art/model.py
index dcbdc5a6..4d6b4e32 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -972,7 +972,6 @@ async def train(
                 k: sum(d.get(k, 0) for d in training_metrics)
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
-                if k != TRAIN_GRADIENT_STEPS_KEY
             }
         avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed)
 
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index ede695cc..5dd7cb0f 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -256,7 +256,6 @@ async def train(  # type: ignore[override]
                 k: sum(d.get(k, 0) for d in training_metrics)
                 / sum(1 for d in training_metrics if k in d)
                 for k in {k for d in training_metrics for k in d}
-                if k != TRAIN_GRADIENT_STEPS_KEY
             }
         summary = summarize_trajectory_groups(groups_list)
         avg_metrics.setdefault(
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 5719009c..b8d38852 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -17,6 +17,8 @@
 import pytest
 
 from art import Model, TrainableModel, Trajectory, TrajectoryGroup
+from art.local.backend import LocalBackend
+from art.metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY
 from art.utils.trajectory_logging import read_trajectory_groups_parquet
 
 
@@ -965,3 +967,92 @@ async def mock_train_sft(*args, **kwargs):
         assert not history_path.exists(), (
             "No history.jsonl should be created for empty training"
         )
+
+
+class TestGradientStepMetrics:
+    @pytest.mark.asyncio
+    async def test_model_train_logs_gradient_step_count(self, tmp_path: Path):
+        model = TrainableModel(
+            name="test-train",
+            project="test-project",
+            base_model="gpt-4",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        async def mock_train_model(*args, **kwargs):
+            for loss in (1.0, 0.8, 0.6):
+                yield {
+                    "loss/train": loss,
+                    TRAIN_GRADIENT_STEPS_KEY: 3.0,
+                }
+
+        mock_backend = MagicMock()
+        mock_backend._train_model = mock_train_model
+        mock_backend._get_step = AsyncMock(return_value=1)
+        model._backend = mock_backend
+
+        groups = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+        ]
+
+        await model.train(groups)
+
+        history_path = tmp_path / "test-project/models/test-train/history.jsonl"
+        rows = [json.loads(line) for line in history_path.open() if line.strip()]
+        merged: dict[str, float] = {}
+        for row in rows:
+            merged.update(row)
+
+        assert merged[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_local_backend_train_returns_gradient_step_count(
+        self, tmp_path: Path
+    ):
+        model = TrainableModel(
+            name="test-backend-train",
+            project="test-project",
+            base_model="gpt-4",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = LocalBackend(path=str(tmp_path))
+
+        async def mock_train_model(*args, **kwargs):
+            for loss in (1.0, 0.8):
+                yield {
+                    "loss/train": loss,
+                    TRAIN_GRADIENT_STEPS_KEY: 2.0,
+                }
+
+        backend._train_model = mock_train_model  # type: ignore[method-assign]
+        backend._get_step = AsyncMock(return_value=1)  # type: ignore[method-assign]
+
+        groups = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+        ]
+
+        result = await backend.train(model, groups, save_checkpoint=False)
+
+        assert result.metrics[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(2.0)

From 004d6106780dc424416461ce3467e3a18bfc61b6 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:30:02 -0700
Subject: [PATCH 23/46] fix: Skip stale MetricsBuilder flush outputs

---
 src/art/metrics.py                  | 46 ++++++++++++++++++++---------
 tests/unit/test_frontend_logging.py | 38 ++++++++++++++++++++++++
 tests/unit/test_metrics_builder.py  | 14 +++++++++
 3 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/src/art/metrics.py b/src/art/metrics.py
index f31e0b6f..b65157a0 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -51,6 +51,7 @@ class _SharedMetricsState:
     step_buffer: dict[str, float]
     cum_state: dict[str, float]
     unique_scenario_ids: set[str]
+    pending_scenario_ids: set[str]
     cost_extractors: dict[str, CostExtractor]
     token_pricing: dict[str, TokenPricing]
 
@@ -61,6 +62,7 @@ def _new_shared_metrics_state() -> _SharedMetricsState:
         step_buffer={},
         cum_state={},
         unique_scenario_ids=set(),
+        pending_scenario_ids=set(),
         cost_extractors={},
         token_pricing=dict(_DEFAULT_TOKEN_PRICING),
     )
@@ -162,6 +164,7 @@ def __init__(
         self._step_buffer = self._shared_state.step_buffer
         self._cum_state = self._shared_state.cum_state
         self._unique_scenario_ids = self._shared_state.unique_scenario_ids
+        self._pending_scenario_ids = self._shared_state.pending_scenario_ids
         self._cost_extractors = self._shared_state.cost_extractors
         self._token_pricing = self._shared_state.token_pricing
 
@@ -187,7 +190,9 @@ def add_data(
         if step_actor_tokens is not None:
             self.add_metric("data/step_actor_tokens", float(step_actor_tokens))
         if scenario_ids is not None:
-            self._unique_scenario_ids.update(scenario_ids)
+            self._pending_scenario_ids.update(
+                str(scenario_id) for scenario_id in scenario_ids
+            )
 
     def add_user_timing(
         self,
@@ -245,13 +250,15 @@ async def flush(self, step: int) -> dict[str, float]:
                 self._cum_state[cum_key] = next_value
                 result[cum_key] = next_value
 
-            if self._unique_scenario_ids:
+            if self._pending_scenario_ids:
+                self._unique_scenario_ids.update(self._pending_scenario_ids)
                 result["data/cum_num_unique_scenarios"] = float(
                     len(self._unique_scenario_ids)
                 )
 
             self._update_throughput_metrics(result)
             self._step_buffer.clear()
+            self._pending_scenario_ids.clear()
             return result
 
     def activate(self) -> Token["MetricsBuilder"]:
@@ -319,11 +326,13 @@ def load_state_dict(self, state: dict[str, Any]) -> None:
         self._shared_state.cum_state.update(restored_cum_state)
         self._shared_state.unique_scenario_ids.clear()
         self._shared_state.unique_scenario_ids.update(restored_unique_ids)
+        self._shared_state.pending_scenario_ids.clear()
 
         # Keep local references aligned with the shared state so derived builders
         # created before or after resume observe the same cumulative state.
         self._cum_state = self._shared_state.cum_state
         self._unique_scenario_ids = self._shared_state.unique_scenario_ids
+        self._pending_scenario_ids = self._shared_state.pending_scenario_ids
 
     def _validate_and_add(self, key: str, value: float) -> None:
         if key.endswith("_cum"):
@@ -391,21 +400,30 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
             self._cum_state[cum_key] = next_value
             result[cum_key] = next_value
 
-        trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum")
-        trainer_seconds = self._cum_state.get("time/step_trainer_s_cum")
         if (
-            trainer_tokens is not None
-            and trainer_seconds is not None
-            and trainer_seconds > 0
+            "data/step_trainer_tokens" in result
+            or "time/step_trainer_s" in result
         ):
-            result["throughput/avg_trainer_tok_per_s"] = (
-                trainer_tokens / trainer_seconds
-            )
+            trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum")
+            trainer_seconds = self._cum_state.get("time/step_trainer_s_cum")
+            if (
+                trainer_tokens is not None
+                and trainer_seconds is not None
+                and trainer_seconds > 0
+            ):
+                result["throughput/avg_trainer_tok_per_s"] = (
+                    trainer_tokens / trainer_seconds
+                )
 
-        actor_tokens = self._cum_state.get("data/step_actor_tokens_cum")
-        actor_seconds = self._cum_state.get("time/step_actor_s_cum")
-        if actor_tokens is not None and actor_seconds is not None and actor_seconds > 0:
-            result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds
+        if "data/step_actor_tokens" in result or "time/step_actor_s" in result:
+            actor_tokens = self._cum_state.get("data/step_actor_tokens_cum")
+            actor_seconds = self._cum_state.get("time/step_actor_s_cum")
+            if (
+                actor_tokens is not None
+                and actor_seconds is not None
+                and actor_seconds > 0
+            ):
+                result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds
 
     def _resolve_token_pricing(
         self,
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index b8d38852..6d515b92 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -707,6 +707,44 @@ async def test_direct_time_and_data_metrics_get_cumulative_variants(
         assert entry["data/step_actor_tokens"] == pytest.approx(10)
         assert entry["data/step_actor_tokens_cum"] == pytest.approx(10)
 
+    @pytest.mark.asyncio
+    async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        model.metrics_builder().add_data(scenario_ids=["scenario-a"])
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=1,
+            metrics={
+                "time/step_trainer_s": 2.0,
+                "data/step_trainer_tokens": 20.0,
+            },
+        )
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=2,
+            metrics={"loss/train": 1.0},
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        rows = [json.loads(line) for line in history_path.open() if line.strip()]
+
+        assert len(rows) == 2
+        assert rows[0]["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
+        assert rows[0]["data/cum_num_unique_scenarios"] == pytest.approx(1.0)
+        assert rows[1]["loss/train"] == pytest.approx(1.0)
+        assert "throughput/avg_trainer_tok_per_s" not in rows[1]
+        assert "data/cum_num_unique_scenarios" not in rows[1]
+
 
 class TestWandbIntegration:
     """Test wandb integration logic (without mocking wandb itself)."""
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index 6b032c05..1746c8f2 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -193,6 +193,20 @@ async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
         second = await builder.flush(step=2)
         assert second["data/cum_num_unique_scenarios"] == 4
 
+    @pytest.mark.asyncio
+    async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_metric("time/step_trainer_s", 2.0)
+        builder.add_metric("data/step_trainer_tokens", 20.0)
+        builder.add_data(scenario_ids=["s1"])
+
+        first = await builder.flush(step=1)
+        assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
+        assert first["data/cum_num_unique_scenarios"] == 1
+
+        second = await builder.flush(step=2)
+        assert second == {}
+
     @pytest.mark.asyncio
     async def test_concurrent_add_cost_calls_do_not_lose_updates(self) -> None:
         builder = MetricsBuilder(cost_context="train")

From b9733666df6514e8228ddf5812dbbf36b6b8d6a0 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:48:36 -0700
Subject: [PATCH 24/46] fix: Normalize Model.log inputs once

---
 src/art/model.py                    | 43 ++++++++++++---------
 tests/unit/test_frontend_logging.py | 59 +++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 18 deletions(-)

diff --git a/src/art/model.py b/src/art/model.py
index 4d6b4e32..40818465 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -578,6 +578,25 @@ def _persist_metrics_builder_state(self) -> None:
             {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()}
         )
 
+    def _normalize_trajectory_groups(
+        self,
+        trajectories: Iterable[Trajectory | BaseException] | Iterable[TrajectoryGroup],
+    ) -> list[TrajectoryGroup]:
+        items = list(trajectories)
+        if not items:
+            return []
+
+        if all(isinstance(item, TrajectoryGroup) for item in items):
+            return cast(list[TrajectoryGroup], items)
+
+        if all(isinstance(item, (Trajectory, BaseException)) for item in items):
+            return [TrajectoryGroup(cast(Iterable[Trajectory | BaseException], items))]
+
+        raise TypeError(
+            "trajectories must be an iterable of TrajectoryGroup objects or "
+            "an iterable of Trajectory/BaseException items"
+        )
+
     async def log(
         self,
         trajectories: (
@@ -622,17 +641,7 @@ async def log(
                 self._persist_metrics_builder_state()
             return
 
-        # Convert to list[TrajectoryGroup]
-        if any(isinstance(t, Trajectory) for t in trajectories) or any(
-            isinstance(t, BaseException) for t in trajectories
-        ):
-            trajectory_groups = [
-                TrajectoryGroup(
-                    cast(Iterable[Trajectory | BaseException], trajectories)
-                )
-            ]
-        else:
-            trajectory_groups = cast(list[TrajectoryGroup], list(trajectories))
+        trajectory_groups = self._normalize_trajectory_groups(trajectories)
 
         default_train_metrics = self._add_default_step_metrics(
             trajectory_groups,
@@ -676,13 +685,11 @@ async def log(
                     if metric not in group_metrics:
                         group_metrics[metric] = []
                     group_metrics[metric].append(float(value))
-            for trajectory in group:
-                if isinstance(trajectory, BaseException):
-                    all_metrics[exception_rate_key].append(1)
-                    continue
-                else:
-                    all_metrics[exception_rate_key].append(0)
-                # Add reward metric
+
+            all_metrics[exception_rate_key].extend(0.0 for _ in group.trajectories)
+            all_metrics[exception_rate_key].extend(1.0 for _ in group.exceptions)
+
+            for trajectory in group.trajectories:
                 all_metrics[reward_key].append(trajectory.reward)
 
                 # Collect other custom metrics
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 6d515b92..7f689c42 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -470,6 +470,65 @@ async def test_exception_rate_calculation(self, tmp_path: Path):
         # All successful trajectories = 0% exception rate
         assert entry["val/exception_rate"] == 0.0
 
+    @pytest.mark.asyncio
+    async def test_exception_rate_counts_group_exceptions(self, tmp_path: Path):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        trajectory_groups = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.5,
+                        messages_and_choices=[{"role": "user", "content": "test"}],
+                    )
+                ],
+                exceptions=[ValueError("boom")],
+            )
+        ]
+
+        await model.log(trajectory_groups, split="val")
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["val/exception_rate"] == pytest.approx(0.5)
+
+    @pytest.mark.asyncio
+    async def test_generator_of_trajectories_is_consumed_once(self, tmp_path: Path):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        def trajectories():
+            yield Trajectory(
+                reward=1.0,
+                metrics={"custom": 1.0},
+                messages_and_choices=[{"role": "user", "content": "first"}],
+            )
+            yield Trajectory(
+                reward=3.0,
+                metrics={"custom": 3.0},
+                messages_and_choices=[{"role": "user", "content": "second"}],
+            )
+
+        await model.log(trajectories(), split="val")
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["val/reward"] == pytest.approx(2.0)
+        assert entry["val/custom"] == pytest.approx(2.0)
+
     @pytest.mark.asyncio
     async def test_train_trajectory_metrics_default_to_reward_prefix(
         self, tmp_path: Path

From 18417e3b74685d29d6716dbe24b20b457d97d8c6 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:49:41 -0700
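Subject: [PATCH 25/46] refactor: Share training metric aggregation helpers

Add average_metric_samples() and build_training_summary_metrics() to
art.metrics_taxonomy. They replace the duplicated inline per-key
averaging (local and serverless backends, Model.train, Model.train_sft)
and the duplicated data/train summary merging (local, serverless and
tinker_native backends).
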
---
 src/art/local/backend.py            | 30 ++++++++--------------
 src/art/metrics_taxonomy.py         | 26 +++++++++++++++++++
 src/art/model.py                    | 15 +++--------
 src/art/serverless/backend.py       | 30 ++++++++--------------
 src/art/tinker_native/backend.py    |  9 ++++---
 tests/unit/test_metrics_taxonomy.py | 40 +++++++++++++++++++++++++++++
 6 files changed, 96 insertions(+), 54 deletions(-)
 create mode 100644 tests/unit/test_metrics_taxonomy.py

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index bdf44179..1d839cf7 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -42,8 +42,8 @@
 from ..backend import AnyTrainableModel, Backend
 from ..metrics_taxonomy import (
     TRAIN_GRADIENT_STEPS_KEY,
-    build_data_metrics_from_summary,
-    build_train_metrics_from_summary,
+    average_metric_samples,
+    build_training_summary_metrics,
     rename_train_metrics,
     summarize_trajectory_groups,
 )
@@ -582,13 +582,7 @@ async def train(  # type: ignore[override]
             training_metrics.append(metrics)
 
         # Aggregate metrics
-        avg_metrics: dict[str, float] = {}
-        if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-            }
+        avg_metrics = average_metric_samples(training_metrics)
         summary = summarize_trajectory_groups(groups_list)
         avg_metrics.setdefault(
             "time/step_trainer_s", time.monotonic() - trainer_started
@@ -596,12 +590,10 @@ async def train(  # type: ignore[override]
         avg_metrics.update(
             {
                 key: value
-                for key, value in {
-                    **build_data_metrics_from_summary(
-                        summary, include_trainable_groups=True
-                    ),
-                    **build_train_metrics_from_summary(summary),
-                }.items()
+                for key, value in build_training_summary_metrics(
+                    summary,
+                    include_trainable_groups=True,
+                ).items()
                 if key not in avg_metrics
             }
         )
@@ -643,10 +635,10 @@ async def _train_model(
             print("Packing tensors...")
 
         summary = summarize_trajectory_groups(trajectory_groups)
-        base_metrics = {
-            **build_data_metrics_from_summary(summary, include_trainable_groups=True),
-            **build_train_metrics_from_summary(summary),
-        }
+        base_metrics = build_training_summary_metrics(
+            summary,
+            include_trainable_groups=True,
+        )
 
         packed_tensors = self._get_packed_tensors(
             model,
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
index 79a175c3..061e0872 100644
--- a/src/art/metrics_taxonomy.py
+++ b/src/art/metrics_taxonomy.py
@@ -45,6 +45,18 @@ def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]:
     return {rename_train_metric_key(key): float(value) for key, value in metrics.items()}
 
 
+def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]:
+    totals: dict[str, float] = {}
+    counts: dict[str, int] = {}
+
+    for sample in metric_samples:
+        for key, value in sample.items():
+            totals[key] = totals.get(key, 0.0) + float(value)
+            counts[key] = counts.get(key, 0) + 1
+
+    return {key: totals[key] / counts[key] for key in totals}
+
+
 @dataclass(frozen=True)
 class TrajectoryBatchSummary:
     num_scenarios: int
@@ -102,6 +114,20 @@ def build_train_metrics_from_summary(
     }
 
 
+def build_training_summary_metrics(
+    summary: TrajectoryBatchSummary,
+    *,
+    include_trainable_groups: bool,
+) -> dict[str, float]:
+    return {
+        **build_data_metrics_from_summary(
+            summary,
+            include_trainable_groups=include_trainable_groups,
+        ),
+        **build_train_metrics_from_summary(summary),
+    }
+
+
 def _group_is_trainable(group: TrajectoryGroup) -> bool:
     rewards = [trajectory.reward for trajectory in group.trajectories]
     return len(rewards) > 1 and len(set(rewards)) > 1
diff --git a/src/art/model.py b/src/art/model.py
index 40818465..625408d1 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -18,6 +18,7 @@
 from .metrics import MetricsBuilder
 from .metrics_taxonomy import (
     TRAIN_GRADIENT_STEPS_KEY,
+    average_metric_samples,
     build_data_metrics_from_summary,
     build_train_metrics_from_summary,
     summarize_trajectory_groups,
@@ -973,13 +974,7 @@ async def train(
         trainer_elapsed = time.monotonic() - trainer_started
 
         # 2. Calculate aggregated training metrics
-        avg_metrics: dict[str, float] = {}
-        if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-            }
+        avg_metrics = average_metric_samples(training_metrics)
         avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed)
 
         # 3. Log trajectories and training metrics together (single wandb log call)
@@ -1024,11 +1019,7 @@ async def train_sft(
 
         # Log aggregated training metrics once (same as RL)
         if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-            }
+            avg_metrics = average_metric_samples(training_metrics)
             avg_metrics["time/step_trainer_s"] = trainer_elapsed
             # Get the current step after training
             step = await self.get_step()
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index 5dd7cb0f..d0589f7f 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -12,8 +12,8 @@
 from ..backend import AnyTrainableModel, Backend
 from ..metrics_taxonomy import (
     TRAIN_GRADIENT_STEPS_KEY,
-    build_data_metrics_from_summary,
-    build_train_metrics_from_summary,
+    average_metric_samples,
+    build_training_summary_metrics,
     rename_train_metrics,
     summarize_trajectory_groups,
 )
@@ -250,13 +250,7 @@ async def train(  # type: ignore[override]
             training_metrics.append(metrics)
 
         # Aggregate metrics
-        avg_metrics: dict[str, float] = {}
-        if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-            }
+        avg_metrics = average_metric_samples(training_metrics)
         summary = summarize_trajectory_groups(groups_list)
         avg_metrics.setdefault(
             "time/step_trainer_s", time.monotonic() - trainer_started
@@ -264,12 +258,10 @@ async def train(  # type: ignore[override]
         avg_metrics.update(
             {
                 key: value
-                for key, value in {
-                    **build_data_metrics_from_summary(
-                        summary, include_trainable_groups=True
-                    ),
-                    **build_train_metrics_from_summary(summary),
-                }.items()
+                for key, value in build_training_summary_metrics(
+                    summary,
+                    include_trainable_groups=True,
+                ).items()
                 if key not in avg_metrics
             }
         )
@@ -300,10 +292,10 @@ async def _train_model(
         verbose: bool = False,
     ) -> AsyncIterator[dict[str, float]]:
         summary = summarize_trajectory_groups(trajectory_groups)
-        base_metrics = {
-            **build_data_metrics_from_summary(summary, include_trainable_groups=True),
-            **build_train_metrics_from_summary(summary),
-        }
+        base_metrics = build_training_summary_metrics(
+            summary,
+            include_trainable_groups=True,
+        )
         assert model.id is not None, "Model ID is required"
         training_job = await self._client.training_jobs.create(  # ty:ignore[possibly-missing-attribute]
             model_id=model.id,
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index 0c6a1654..aeb41e1c 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -31,8 +31,7 @@
 from ..backend import Backend
 from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing
 from ..metrics_taxonomy import (
-    build_data_metrics_from_summary,
-    build_train_metrics_from_summary,
+    build_training_summary_metrics,
     rename_train_metric_key,
     summarize_trajectory_groups,
 )
@@ -224,8 +223,10 @@ async def train(  # type: ignore[override]
         )
 
         metrics: dict[str, float] = {
-            **build_data_metrics_from_summary(summary, include_trainable_groups=True),
-            **build_train_metrics_from_summary(summary),
+            **build_training_summary_metrics(
+                summary,
+                include_trainable_groups=True,
+            ),
             "data/step_num_datums": float(len(datums)),
         }
 
diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py
new file mode 100644
index 00000000..45085476
--- /dev/null
+++ b/tests/unit/test_metrics_taxonomy.py
@@ -0,0 +1,40 @@
+import pytest
+
+from art.metrics_taxonomy import (
+    TrajectoryBatchSummary,
+    average_metric_samples,
+    build_training_summary_metrics,
+)
+
+
+def test_average_metric_samples_handles_sparse_keys() -> None:
+    averaged = average_metric_samples(
+        [
+            {"loss/train": 1.0, "loss/grad_norm": 0.5},
+            {"loss/train": 0.5},
+            {"loss/grad_norm": 1.0},
+        ]
+    )
+
+    assert averaged["loss/train"] == pytest.approx(0.75)
+    assert averaged["loss/grad_norm"] == pytest.approx(0.75)
+
+
+def test_build_training_summary_metrics_includes_data_and_train_sections() -> None:
+    summary = TrajectoryBatchSummary(
+        num_scenarios=2,
+        num_trajectories=5,
+        num_groups_submitted=2,
+        num_groups_trainable=1,
+        scenario_ids=["a", "b"],
+    )
+
+    metrics = build_training_summary_metrics(
+        summary,
+        include_trainable_groups=True,
+    )
+
+    assert metrics["data/step_num_scenarios"] == pytest.approx(2.0)
+    assert metrics["data/step_num_groups_trainable"] == pytest.approx(1.0)
+    assert metrics["train/num_groups_submitted"] == pytest.approx(2.0)
+    assert metrics["train/num_trajectories"] == pytest.approx(5.0)

From ffe815e2a5d94937b84e4b82aeabfed18b0cc473 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:52:22 -0700
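Subject: [PATCH 26/46] refactor: Simplify MetricsBuilder state access

Read the lock, step buffer, cumulative state, scenario-id sets, cost
extractors and token pricing through `self._shared_state` instead of
per-instance aliases created in __init__ and re-bound in
load_state_dict.

MetricsBuilder.flush() no longer takes `step`; the argument was
discarded with `del step`. The callers in Model.log and the unit tests
are updated accordingly.
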
---
 src/art/metrics.py                 | 84 +++++++++++++++---------------
 src/art/model.py                   |  4 +-
 tests/unit/test_metrics_builder.py | 40 +++++++-------
 tests/unit/test_track_api_cost.py  |  8 +--
 4 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/src/art/metrics.py b/src/art/metrics.py
index b65157a0..809d5061 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -160,13 +160,6 @@ def __init__(
         self._shared_state = (
             _shared_state if _shared_state is not None else _new_shared_metrics_state()
         )
-        self._lock = self._shared_state.lock
-        self._step_buffer = self._shared_state.step_buffer
-        self._cum_state = self._shared_state.cum_state
-        self._unique_scenario_ids = self._shared_state.unique_scenario_ids
-        self._pending_scenario_ids = self._shared_state.pending_scenario_ids
-        self._cost_extractors = self._shared_state.cost_extractors
-        self._token_pricing = self._shared_state.token_pricing
 
     def add_cost(self, path: str, usd: float) -> None:
         if not path:
@@ -190,7 +183,7 @@ def add_data(
         if step_actor_tokens is not None:
             self.add_metric("data/step_actor_tokens", float(step_actor_tokens))
         if scenario_ids is not None:
-            self._pending_scenario_ids.update(
+            self._shared_state.pending_scenario_ids.update(
                 str(scenario_id) for scenario_id in scenario_ids
             )
 
@@ -228,15 +221,14 @@ def measure(self, key: str):
         finally:
             self.add_metric(key, time.monotonic() - started)
 
-    async def flush(self, step: int) -> dict[str, float]:
-        del step
-        async with self._lock:
+    async def flush(self) -> dict[str, float]:
+        async with self._shared_state.lock:
             self._validate_hierarchy()
 
-            result = dict(self._step_buffer)
+            result = dict(self._shared_state.step_buffer)
             cost_metrics = {
                 key: value
-                for key, value in self._step_buffer.items()
+                for key, value in self._shared_state.step_buffer.items()
                 if key.startswith("costs/")
             }
             result.update(self._compute_rollups(cost_metrics))
@@ -246,19 +238,21 @@ async def flush(self, step: int) -> dict[str, float]:
                 if section not in _HIERARCHICAL_SECTIONS:
                     continue
                 cum_key = f"{key}_cum"
-                next_value = self._cum_state.get(cum_key, 0.0) + value
-                self._cum_state[cum_key] = next_value
+                next_value = self._shared_state.cum_state.get(cum_key, 0.0) + value
+                self._shared_state.cum_state[cum_key] = next_value
                 result[cum_key] = next_value
 
-            if self._pending_scenario_ids:
-                self._unique_scenario_ids.update(self._pending_scenario_ids)
+            if self._shared_state.pending_scenario_ids:
+                self._shared_state.unique_scenario_ids.update(
+                    self._shared_state.pending_scenario_ids
+                )
                 result["data/cum_num_unique_scenarios"] = float(
-                    len(self._unique_scenario_ids)
+                    len(self._shared_state.unique_scenario_ids)
                 )
 
             self._update_throughput_metrics(result)
-            self._step_buffer.clear()
-            self._pending_scenario_ids.clear()
+            self._shared_state.step_buffer.clear()
+            self._shared_state.pending_scenario_ids.clear()
             return result
 
     def activate(self) -> Token["MetricsBuilder"]:
@@ -293,7 +287,7 @@ def register_cost_extractor(
         normalized_provider = _normalize_provider(provider)
         if normalized_provider is None:
             raise ValueError("provider must be non-empty")
-        self._cost_extractors[normalized_provider] = extractor
+        self._shared_state.cost_extractors[normalized_provider] = extractor
 
     def register_token_pricing(
         self,
@@ -305,15 +299,15 @@ def register_token_pricing(
         normalized_provider = _normalize_provider(provider)
         if normalized_provider is None:
             raise ValueError("provider must be non-empty")
-        self._token_pricing[normalized_provider] = TokenPricing(
+        self._shared_state.token_pricing[normalized_provider] = TokenPricing(
             prompt_per_million=float(prompt_per_million),
             completion_per_million=float(completion_per_million),
         )
 
     def state_dict(self) -> dict[str, Any]:
         return {
-            "cum_state": dict(self._cum_state),
-            "unique_scenario_ids": list(self._unique_scenario_ids),
+            "cum_state": dict(self._shared_state.cum_state),
+            "unique_scenario_ids": list(self._shared_state.unique_scenario_ids),
         }
 
     def load_state_dict(self, state: dict[str, Any]) -> None:
@@ -328,19 +322,13 @@ def load_state_dict(self, state: dict[str, Any]) -> None:
         self._shared_state.unique_scenario_ids.update(restored_unique_ids)
         self._shared_state.pending_scenario_ids.clear()
 
-        # Keep local references aligned with the shared state so derived builders
-        # created before or after resume observe the same cumulative state.
-        self._cum_state = self._shared_state.cum_state
-        self._unique_scenario_ids = self._shared_state.unique_scenario_ids
-        self._pending_scenario_ids = self._shared_state.pending_scenario_ids
-
     def _validate_and_add(self, key: str, value: float) -> None:
         if key.endswith("_cum"):
             raise ValueError(
                 f"Metric key '{key}' ends with '_cum', which is reserved for cumulative metrics."
             )
 
-        for existing_key in self._step_buffer:
+        for existing_key in self._shared_state.step_buffer:
             if existing_key == key:
                 continue
             if existing_key.startswith(f"{key}/"):
@@ -352,10 +340,14 @@ def _validate_and_add(self, key: str, value: float) -> None:
                     f"Cannot log '{key}' as a leaf: '{existing_key}' is already a leaf ancestor."
                 )
 
-        self._step_buffer[key] = self._step_buffer.get(key, 0.0) + value
+        self._shared_state.step_buffer[key] = (
+            self._shared_state.step_buffer.get(key, 0.0) + value
+        )
 
     def _validate_hierarchy(self) -> None:
-        keys = sorted(k for k in self._step_buffer if k.startswith("costs/"))
+        keys = sorted(
+            k for k in self._shared_state.step_buffer if k.startswith("costs/")
+        )
         for i, key in enumerate(keys):
             for other in keys[i + 1 :]:
                 if other.startswith(f"{key}/"):
@@ -396,16 +388,22 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
         for step_key, cum_key in _THROUGHPUT_IDLE_MAPPINGS.items():
             if step_key not in result:
                 continue
-            next_value = self._cum_state.get(cum_key, 0.0) + result[step_key]
-            self._cum_state[cum_key] = next_value
+            next_value = (
+                self._shared_state.cum_state.get(cum_key, 0.0) + result[step_key]
+            )
+            self._shared_state.cum_state[cum_key] = next_value
             result[cum_key] = next_value
 
         if (
             "data/step_trainer_tokens" in result
             or "time/step_trainer_s" in result
         ):
-            trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum")
-            trainer_seconds = self._cum_state.get("time/step_trainer_s_cum")
+            trainer_tokens = self._shared_state.cum_state.get(
+                "data/step_trainer_tokens_cum"
+            )
+            trainer_seconds = self._shared_state.cum_state.get(
+                "time/step_trainer_s_cum"
+            )
             if (
                 trainer_tokens is not None
                 and trainer_seconds is not None
@@ -416,8 +414,10 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
                 )
 
         if "data/step_actor_tokens" in result or "time/step_actor_s" in result:
-            actor_tokens = self._cum_state.get("data/step_actor_tokens_cum")
-            actor_seconds = self._cum_state.get("time/step_actor_s_cum")
+            actor_tokens = self._shared_state.cum_state.get(
+                "data/step_actor_tokens_cum"
+            )
+            actor_seconds = self._shared_state.cum_state.get("time/step_actor_s_cum")
             if (
                 actor_tokens is not None
                 and actor_seconds is not None
@@ -433,9 +433,9 @@ def _resolve_token_pricing(
         completion_price_per_million: float | None = None,
     ) -> TokenPricing:
         normalized_provider = _normalize_provider(provider) or _DEFAULT_PROVIDER
-        default_pricing = self._token_pricing.get(
+        default_pricing = self._shared_state.token_pricing.get(
             normalized_provider,
-            self._token_pricing[_DEFAULT_PROVIDER],
+            self._shared_state.token_pricing[_DEFAULT_PROVIDER],
         )
         return TokenPricing(
             prompt_per_million=(
@@ -460,7 +460,7 @@ def _extract_api_cost(
     ) -> float | None:
         provider_name = _normalize_provider(provider) or _detect_provider(response)
         if provider_name is not None:
-            custom_extractor = self._cost_extractors.get(provider_name)
+            custom_extractor = self._shared_state.cost_extractors.get(provider_name)
             if custom_extractor is not None:
                 custom_cost = custom_extractor(response)
                 if custom_cost is not None:
diff --git a/src/art/model.py b/src/art/model.py
index 625408d1..ff0d75e3 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -636,7 +636,7 @@ async def log(
                 metrics_without_costs = self._extract_non_cost_metrics(metrics, split)
                 if metrics_without_costs:
                     self._log_metrics(metrics_without_costs, split, step)
-                costs = await self._metrics_builder.flush(step)
+                costs = await self._metrics_builder.flush()
                 if costs:
                     self._log_metrics(costs, split, step)
                 self._persist_metrics_builder_state()
@@ -742,7 +742,7 @@ async def log(
         self._log_metrics(averages, split, step)
 
         # 4. Log cumulative costs
-        costs = await self._metrics_builder.flush(step)
+        costs = await self._metrics_builder.flush()
         if costs:
             self._log_metrics(costs, split, step)
         self._persist_metrics_builder_state()
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index 1746c8f2..1ef7cbae 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -15,7 +15,7 @@ async def test_rollup_correctness_across_depths(self) -> None:
         builder.add_cost("train/tinker_inference", usd=0.45)
         builder.add_cost("eval/llm_judge/correctness", usd=0.06)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
 
         assert metrics["costs/train/llm_judge"] == pytest.approx(0.12)
         assert metrics["costs/train"] == pytest.approx(1.77)
@@ -35,7 +35,7 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
             step_actor_tokens=10,
             scenario_ids=["a", "b"],
         )
-        first = await builder.flush(step=1)
+        first = await builder.flush()
 
         assert first["time/step_wall_s_cum"] == pytest.approx(1.5)
         assert first["time/step_actor_s_cum"] == pytest.approx(0.3)
@@ -49,7 +49,7 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
             step_actor_tokens=5,
             scenario_ids=["b", "c"],
         )
-        second = await builder.flush(step=2)
+        second = await builder.flush()
 
         assert second["time/step_wall_s_cum"] == pytest.approx(2.0)
         assert second["time/step_actor_s_cum"] == pytest.approx(0.5)
@@ -68,7 +68,7 @@ async def test_helper_metrics_accumulate_within_a_single_step(self) -> None:
         builder.add_idle_times(step_trainer_idle_s=1.0, step_actor_idle_s=2.0)
         builder.add_idle_times(step_trainer_idle_s=0.5, step_actor_idle_s=1.0)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
 
         assert metrics["data/step_num_scenarios"] == pytest.approx(5)
         assert metrics["data/step_actor_tokens"] == pytest.approx(15)
@@ -88,7 +88,7 @@ async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -
         builder.add_metric("data/step_actor_tokens", 10.0)
         builder.add_idle_times(step_trainer_idle_s=1.5, step_actor_idle_s=0.5)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
 
         assert metrics["throughput/cum_trainer_idle_s"] == pytest.approx(1.5)
         assert metrics["throughput/cum_actor_idle_s"] == pytest.approx(0.5)
@@ -99,13 +99,13 @@ async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -
     async def test_costs_all_generated_for_single_and_multiple_children(self) -> None:
         single = MetricsBuilder(cost_context="train")
         single.add_cost("train/gpu", usd=2.0)
-        one = await single.flush(step=1)
+        one = await single.flush()
         assert one["costs/all"] == pytest.approx(2.0)
 
         multi = MetricsBuilder(cost_context="train")
         multi.add_cost("train/gpu", usd=2.0)
         multi.add_cost("eval/llm_judge/correctness", usd=0.5)
-        two = await multi.flush(step=1)
+        two = await multi.flush()
         assert two["costs/all"] == pytest.approx(2.5)
 
     def test_leaf_parent_conflicts_raise(self) -> None:
@@ -125,7 +125,7 @@ async def test_duplicate_leaf_writes_are_summed(self) -> None:
         builder.add_cost("train/gpu", usd=1.25)
         builder.add_cost("train/gpu", usd=0.75)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
 
         assert metrics["costs/train/gpu"] == pytest.approx(2.0)
         assert metrics["costs/train"] == pytest.approx(2.0)
@@ -140,14 +140,14 @@ def test_cum_suffix_is_reserved(self) -> None:
     async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None:
         builder = MetricsBuilder(cost_context="train")
         builder.add_cost("train/gpu", usd=1.0)
-        first = await builder.flush(step=1)
+        first = await builder.flush()
         assert first["costs/train_cum"] == pytest.approx(1.0)
 
-        second = await builder.flush(step=2)
+        second = await builder.flush()
         assert not any(key.startswith("costs/") for key in second)
 
         builder.add_cost("train/gpu", usd=2.0)
-        third = await builder.flush(step=3)
+        third = await builder.flush()
         assert third["costs/train"] == pytest.approx(2.0)
         assert third["costs/train_cum"] == pytest.approx(3.0)
 
@@ -155,14 +155,14 @@ async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None:
     async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
         before = MetricsBuilder(cost_context="train")
         before.add_cost("train/gpu", usd=1.0)
-        await before.flush(step=1)
+        await before.flush()
 
         state = before.state_dict()
         after = MetricsBuilder(cost_context="train")
         after.load_state_dict(state)
         after.add_cost("train/gpu", usd=2.0)
 
-        metrics = await after.flush(step=2)
+        metrics = await after.flush()
         assert metrics["costs/train_cum"] == pytest.approx(3.0)
         assert metrics["costs/all_cum"] == pytest.approx(3.0)
 
@@ -170,7 +170,7 @@ async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
     async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
         before = MetricsBuilder(cost_context="train")
         before.add_cost("train/gpu", usd=1.0)
-        await before.flush(step=1)
+        await before.flush()
 
         after = MetricsBuilder(cost_context="train")
         after.load_state_dict(before.state_dict())
@@ -178,7 +178,7 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
         eval_builder = after.for_cost_context("eval")
         eval_builder.add_cost("eval/judge", usd=2.0)
 
-        metrics = await eval_builder.flush(step=2)
+        metrics = await eval_builder.flush()
         assert metrics["costs/eval/judge"] == pytest.approx(2.0)
         assert metrics["costs/all_cum"] == pytest.approx(3.0)
 
@@ -186,11 +186,11 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
     async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
         builder = MetricsBuilder(cost_context="train")
         builder.add_data(scenario_ids=["s1", "s2", "s3"])
-        first = await builder.flush(step=1)
+        first = await builder.flush()
         assert first["data/cum_num_unique_scenarios"] == 3
 
         builder.add_data(scenario_ids=["s2", "s4"])
-        second = await builder.flush(step=2)
+        second = await builder.flush()
         assert second["data/cum_num_unique_scenarios"] == 4
 
     @pytest.mark.asyncio
@@ -200,11 +200,11 @@ async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None:
         builder.add_metric("data/step_trainer_tokens", 20.0)
         builder.add_data(scenario_ids=["s1"])
 
-        first = await builder.flush(step=1)
+        first = await builder.flush()
         assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
         assert first["data/cum_num_unique_scenarios"] == 1
 
-        second = await builder.flush(step=2)
+        second = await builder.flush()
         assert second == {}
 
     @pytest.mark.asyncio
@@ -217,7 +217,7 @@ async def worker() -> None:
                 await asyncio.sleep(0)
 
         await asyncio.gather(*(worker() for _ in range(4)))
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
 
         assert metrics["costs/train/gpu"] == pytest.approx(10.0)
         assert metrics["costs/all"] == pytest.approx(10.0)
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index 09915d12..897a57d2 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -52,7 +52,7 @@ async def _judge() -> _OpenAIResponse:
         finally:
             token.var.reset(token)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
         assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
 
     @pytest.mark.asyncio
@@ -74,7 +74,7 @@ async def _judge() -> _AnthropicResponse:
         finally:
             token.var.reset(token)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
         assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062)
 
     @pytest.mark.asyncio
@@ -97,7 +97,7 @@ async def _judge() -> _OpenAIResponse:
         finally:
             token.var.reset(token)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
         assert metrics["costs/train/llm_judge/custom"] == pytest.approx(0.75)
 
     @pytest.mark.asyncio
@@ -129,7 +129,7 @@ async def _judge() -> _OpenAIResponse:
         finally:
             token.var.reset(token)
 
-        metrics = await builder.flush(step=1)
+        metrics = await builder.flush()
         assert metrics["costs/eval/llm_judge/correctness"] == pytest.approx(0.0002)
 
 

From 9ee7a5968114aabfb98d7e27e0ac7d16c06f34c0 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 14:56:34 -0700
Subject: [PATCH 27/46] refactor: Extract API cost tracking helpers

---
 src/art/metrics.py                 | 288 ++++-------------------------
 src/art/metrics_api_cost.py        | 252 +++++++++++++++++++++++++
 tests/unit/test_metrics_builder.py |  18 ++
 3 files changed, 309 insertions(+), 249 deletions(-)
 create mode 100644 src/art/metrics_api_cost.py

diff --git a/src/art/metrics.py b/src/art/metrics.py
index 809d5061..bb23f510 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -1,16 +1,19 @@
 from __future__ import annotations
 
 import asyncio
-from collections.abc import Callable
 from contextlib import contextmanager
 from contextvars import ContextVar, Token
 from dataclasses import dataclass
-from functools import wraps
-from inspect import iscoroutinefunction
 import time
-from typing import Any, ParamSpec, TypeVar
+from typing import Any
 
-from .costs import tokens_to_cost
+from .metrics_api_cost import (
+    DEFAULT_TOKEN_PRICING,
+    CostExtractor,
+    TokenPricing,
+    extract_api_cost,
+    normalize_provider,
+)
 
 _active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder")
 
@@ -19,30 +22,6 @@
     "throughput/step_trainer_idle_s": "throughput/cum_trainer_idle_s",
     "throughput/step_actor_idle_s": "throughput/cum_actor_idle_s",
 }
-_DEFAULT_PROVIDER = "openai"
-_OPENAI_PROVIDER = "openai"
-_ANTHROPIC_PROVIDER = "anthropic"
-
-P = ParamSpec("P")
-R = TypeVar("R")
-
-
-CostExtractor = Callable[[Any], float | None]
-ResponseGetter = Callable[[Any], Any]
-
-
-@dataclass(frozen=True)
-class TokenPricing:
-    prompt_per_million: float
-    completion_per_million: float
-
-
-_DEFAULT_TOKEN_PRICING = {
-    _OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0),
-    _ANTHROPIC_PROVIDER: TokenPricing(
-        prompt_per_million=3.0, completion_per_million=15.0
-    ),
-}
 
 
 @dataclass
@@ -64,83 +43,7 @@ def _new_shared_metrics_state() -> _SharedMetricsState:
         unique_scenario_ids=set(),
         pending_scenario_ids=set(),
         cost_extractors={},
-        token_pricing=dict(_DEFAULT_TOKEN_PRICING),
-    )
-
-
-def _normalize_provider(provider: str | None) -> str | None:
-    if provider is None:
-        return None
-    normalized = provider.strip().lower()
-    if not normalized:
-        return None
-    return normalized
-
-
-def _read_usage_field(usage: Any, field: str) -> float | None:
-    if usage is None:
-        return None
-    if isinstance(usage, dict):
-        value = usage.get(field)
-    else:
-        value = getattr(usage, field, None)
-    if value is None:
-        return None
-    return float(value)
-
-
-def _response_usage(response: Any) -> Any:
-    if isinstance(response, dict):
-        return response.get("usage")
-    return getattr(response, "usage", None)
-
-
-def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None:
-    usage = _response_usage(response)
-    prompt_tokens = _read_usage_field(usage, "prompt_tokens")
-    completion_tokens = _read_usage_field(usage, "completion_tokens")
-    if prompt_tokens is None and completion_tokens is None:
-        return None
-    return prompt_tokens or 0.0, completion_tokens or 0.0
-
-
-def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None:
-    usage = _response_usage(response)
-    input_tokens = _read_usage_field(usage, "input_tokens")
-    output_tokens = _read_usage_field(usage, "output_tokens")
-    if input_tokens is None and output_tokens is None:
-        return None
-    return input_tokens or 0.0, output_tokens or 0.0
-
-
-def _detect_provider(response: Any) -> str | None:
-    usage = _response_usage(response)
-    if usage is None:
-        return None
-
-    if (
-        _read_usage_field(usage, "prompt_tokens") is not None
-        or _read_usage_field(usage, "completion_tokens") is not None
-    ):
-        return _OPENAI_PROVIDER
-    if (
-        _read_usage_field(usage, "input_tokens") is not None
-        or _read_usage_field(usage, "output_tokens") is not None
-    ):
-        return _ANTHROPIC_PROVIDER
-    return None
-
-
-def _estimate_cost(
-    token_counts: tuple[float, float] | None,
-    pricing: TokenPricing,
-) -> float | None:
-    if token_counts is None:
-        return None
-    prompt_tokens, completion_tokens = token_counts
-    return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost(
-        completion_tokens,
-        pricing.completion_per_million,
+        token_pricing=dict(DEFAULT_TOKEN_PRICING),
     )
 
 
@@ -167,6 +70,33 @@ def add_cost(self, path: str, usd: float) -> None:
         full_key = f"costs/{path}"
         self.add_metric(full_key, float(usd))
 
+    def add_response_cost(
+        self,
+        source: str,
+        response: Any,
+        *,
+        provider: str | None = None,
+        prompt_price_per_million: float | None = None,
+        completion_price_per_million: float | None = None,
+    ) -> float | None:
+        normalized_source = source.strip("/")
+        if not normalized_source:
+            raise ValueError("source must be non-empty")
+
+        cost = extract_api_cost(
+            response,
+            provider=provider,
+            prompt_price_per_million=prompt_price_per_million,
+            completion_price_per_million=completion_price_per_million,
+            cost_extractors=self._shared_state.cost_extractors,
+            token_pricing=self._shared_state.token_pricing,
+        )
+        if cost is None:
+            return None
+
+        self.add_cost(f"{self.cost_context}/{normalized_source}", cost)
+        return cost
+
     def add_metric(self, key: str, value: float) -> None:
         if "/" not in key:
             raise ValueError("Metric key must include a section prefix")
@@ -284,7 +214,7 @@ def for_cost_context(self, cost_context: str) -> "MetricsBuilder":
     def register_cost_extractor(
         self, provider: str, extractor: CostExtractor
     ) -> None:
-        normalized_provider = _normalize_provider(provider)
+        normalized_provider = normalize_provider(provider)
         if normalized_provider is None:
             raise ValueError("provider must be non-empty")
         self._shared_state.cost_extractors[normalized_provider] = extractor
@@ -296,7 +226,7 @@ def register_token_pricing(
         prompt_per_million: float,
         completion_per_million: float,
     ) -> None:
-        normalized_provider = _normalize_provider(provider)
+        normalized_provider = normalize_provider(provider)
         if normalized_provider is None:
             raise ValueError("provider must be non-empty")
         self._shared_state.token_pricing[normalized_provider] = TokenPricing(
@@ -425,145 +355,5 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
             ):
                 result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds
 
-    def _resolve_token_pricing(
-        self,
-        provider: str | None,
-        *,
-        prompt_price_per_million: float | None = None,
-        completion_price_per_million: float | None = None,
-    ) -> TokenPricing:
-        normalized_provider = _normalize_provider(provider) or _DEFAULT_PROVIDER
-        default_pricing = self._shared_state.token_pricing.get(
-            normalized_provider,
-            self._shared_state.token_pricing[_DEFAULT_PROVIDER],
-        )
-        return TokenPricing(
-            prompt_per_million=(
-                float(prompt_price_per_million)
-                if prompt_price_per_million is not None
-                else default_pricing.prompt_per_million
-            ),
-            completion_per_million=(
-                float(completion_price_per_million)
-                if completion_price_per_million is not None
-                else default_pricing.completion_per_million
-            ),
-        )
-
-    def _extract_api_cost(
-        self,
-        response: Any,
-        *,
-        provider: str | None = None,
-        prompt_price_per_million: float | None = None,
-        completion_price_per_million: float | None = None,
-    ) -> float | None:
-        provider_name = _normalize_provider(provider) or _detect_provider(response)
-        if provider_name is not None:
-            custom_extractor = self._shared_state.cost_extractors.get(provider_name)
-            if custom_extractor is not None:
-                custom_cost = custom_extractor(response)
-                if custom_cost is not None:
-                    return float(custom_cost)
-
-            token_pricing = self._resolve_token_pricing(
-                provider_name,
-                prompt_price_per_million=prompt_price_per_million,
-                completion_price_per_million=completion_price_per_million,
-            )
-            if provider_name == _OPENAI_PROVIDER:
-                return _estimate_cost(
-                    _extract_openai_token_counts(response),
-                    token_pricing,
-                )
-            if provider_name == _ANTHROPIC_PROVIDER:
-                return _estimate_cost(
-                    _extract_anthropic_token_counts(response),
-                    token_pricing,
-                )
-
-        token_pricing = self._resolve_token_pricing(
-            provider_name,
-            prompt_price_per_million=prompt_price_per_million,
-            completion_price_per_million=completion_price_per_million,
-        )
-        token_counts = _extract_openai_token_counts(response)
-        if token_counts is None:
-            token_counts = _extract_anthropic_token_counts(response)
-        return _estimate_cost(token_counts, token_pricing)
-
-
-def _record_api_cost(
-    *,
-    result: Any,
-    source: str,
-    provider: str | None,
-    response_getter: ResponseGetter | None,
-    prompt_price_per_million: float | None,
-    completion_price_per_million: float | None,
-) -> None:
-    try:
-        builder = MetricsBuilder.get_active()
-    except LookupError:
-        return
-
-    response = response_getter(result) if response_getter is not None else result
-    cost = builder._extract_api_cost(
-        response,
-        provider=provider,
-        prompt_price_per_million=prompt_price_per_million,
-        completion_price_per_million=completion_price_per_million,
-    )
-    if cost is None:
-        return
-    builder.add_cost(f"{builder.cost_context}/{source}", cost)
-
-
-def track_api_cost(
-    *,
-    source: str,
-    provider: str | None = None,
-    response_getter: ResponseGetter | None = None,
-    prompt_price_per_million: float | None = None,
-    completion_price_per_million: float | None = None,
-) -> Callable[[Callable[P, R]], Callable[P, R]]:
-    normalized_source = source.strip("/")
-    if not normalized_source:
-        raise ValueError("source must be non-empty")
-
-    normalized_provider = _normalize_provider(provider)
-
-    def _decorate(func: Callable[P, R]) -> Callable[P, R]:
-        if iscoroutinefunction(func):
-
-            @wraps(func)
-            async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
-                result = await func(*args, **kwargs)
-                _record_api_cost(
-                    result=result,
-                    source=normalized_source,
-                    provider=normalized_provider,
-                    response_getter=response_getter,
-                    prompt_price_per_million=prompt_price_per_million,
-                    completion_price_per_million=completion_price_per_million,
-                )
-                return result
-
-            return _async_wrapper
-
-        @wraps(func)
-        def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
-            result = func(*args, **kwargs)
-            _record_api_cost(
-                result=result,
-                source=normalized_source,
-                provider=normalized_provider,
-                response_getter=response_getter,
-                prompt_price_per_million=prompt_price_per_million,
-                completion_price_per_million=completion_price_per_million,
-            )
-            return result
-
-        return _sync_wrapper
 
-    return _decorate
+from .metrics_api_cost import track_api_cost
diff --git a/src/art/metrics_api_cost.py b/src/art/metrics_api_cost.py
new file mode 100644
index 00000000..f9a8d3eb
--- /dev/null
+++ b/src/art/metrics_api_cost.py
@@ -0,0 +1,252 @@
+from __future__ import annotations
+
+from collections.abc import Callable, Mapping
+from dataclasses import dataclass
+from functools import wraps
+from inspect import iscoroutinefunction
+from typing import Any, ParamSpec, TypeVar
+
+from .costs import tokens_to_cost
+
+DEFAULT_PROVIDER = "openai"
+OPENAI_PROVIDER = "openai"
+ANTHROPIC_PROVIDER = "anthropic"
+
+P = ParamSpec("P")
+R = TypeVar("R")
+
+CostExtractor = Callable[[Any], float | None]
+ResponseGetter = Callable[[Any], Any]
+
+
+@dataclass(frozen=True)
+class TokenPricing:
+    prompt_per_million: float
+    completion_per_million: float
+
+
+DEFAULT_TOKEN_PRICING = {
+    OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0),
+    ANTHROPIC_PROVIDER: TokenPricing(
+        prompt_per_million=3.0,
+        completion_per_million=15.0,
+    ),
+}
+
+
+def normalize_provider(provider: str | None) -> str | None:
+    if provider is None:
+        return None
+    normalized = provider.strip().lower()
+    if not normalized:
+        return None
+    return normalized
+
+
+def _read_usage_field(usage: Any, field: str) -> float | None:
+    if usage is None:
+        return None
+    if isinstance(usage, dict):
+        value = usage.get(field)
+    else:
+        value = getattr(usage, field, None)
+    if value is None:
+        return None
+    return float(value)
+
+
+def _response_usage(response: Any) -> Any:
+    if isinstance(response, dict):
+        return response.get("usage")
+    return getattr(response, "usage", None)
+
+
+def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None:
+    usage = _response_usage(response)
+    prompt_tokens = _read_usage_field(usage, "prompt_tokens")
+    completion_tokens = _read_usage_field(usage, "completion_tokens")
+    if prompt_tokens is None and completion_tokens is None:
+        return None
+    return prompt_tokens or 0.0, completion_tokens or 0.0
+
+
+def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None:
+    usage = _response_usage(response)
+    input_tokens = _read_usage_field(usage, "input_tokens")
+    output_tokens = _read_usage_field(usage, "output_tokens")
+    if input_tokens is None and output_tokens is None:
+        return None
+    return input_tokens or 0.0, output_tokens or 0.0
+
+
+def _detect_provider(response: Any) -> str | None:
+    usage = _response_usage(response)
+    if usage is None:
+        return None
+
+    if (
+        _read_usage_field(usage, "prompt_tokens") is not None
+        or _read_usage_field(usage, "completion_tokens") is not None
+    ):
+        return OPENAI_PROVIDER
+    if (
+        _read_usage_field(usage, "input_tokens") is not None
+        or _read_usage_field(usage, "output_tokens") is not None
+    ):
+        return ANTHROPIC_PROVIDER
+    return None
+
+
+def _estimate_cost(
+    token_counts: tuple[float, float] | None,
+    pricing: TokenPricing,
+) -> float | None:
+    if token_counts is None:
+        return None
+    prompt_tokens, completion_tokens = token_counts
+    return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost(
+        completion_tokens,
+        pricing.completion_per_million,
+    )
+
+
+def _resolve_token_pricing(
+    provider: str | None,
+    *,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    token_pricing: Mapping[str, TokenPricing],
+) -> TokenPricing:
+    normalized_provider = normalize_provider(provider) or DEFAULT_PROVIDER
+    default_pricing = token_pricing.get(
+        normalized_provider,
+        token_pricing[DEFAULT_PROVIDER],
+    )
+    return TokenPricing(
+        prompt_per_million=(
+            float(prompt_price_per_million)
+            if prompt_price_per_million is not None
+            else default_pricing.prompt_per_million
+        ),
+        completion_per_million=(
+            float(completion_price_per_million)
+            if completion_price_per_million is not None
+            else default_pricing.completion_per_million
+        ),
+    )
+
+
+def extract_api_cost(
+    response: Any,
+    *,
+    provider: str | None,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    cost_extractors: Mapping[str, CostExtractor],
+    token_pricing: Mapping[str, TokenPricing],
+) -> float | None:
+    provider_name = normalize_provider(provider) or _detect_provider(response)
+    if provider_name is not None:
+        custom_extractor = cost_extractors.get(provider_name)
+        if custom_extractor is not None:
+            custom_cost = custom_extractor(response)
+            if custom_cost is not None:
+                return float(custom_cost)
+
+        pricing = _resolve_token_pricing(
+            provider_name,
+            prompt_price_per_million=prompt_price_per_million,
+            completion_price_per_million=completion_price_per_million,
+            token_pricing=token_pricing,
+        )
+        if provider_name == OPENAI_PROVIDER:
+            return _estimate_cost(_extract_openai_token_counts(response), pricing)
+        if provider_name == ANTHROPIC_PROVIDER:
+            return _estimate_cost(_extract_anthropic_token_counts(response), pricing)
+
+    pricing = _resolve_token_pricing(
+        provider_name,
+        prompt_price_per_million=prompt_price_per_million,
+        completion_price_per_million=completion_price_per_million,
+        token_pricing=token_pricing,
+    )
+    token_counts = _extract_openai_token_counts(response)
+    if token_counts is None:
+        token_counts = _extract_anthropic_token_counts(response)
+    return _estimate_cost(token_counts, pricing)
+
+
+def _record_api_cost(
+    *,
+    result: Any,
+    source: str,
+    provider: str | None,
+    response_getter: ResponseGetter | None,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+) -> None:
+    try:
+        from .metrics import MetricsBuilder
+
+        builder = MetricsBuilder.get_active()
+    except LookupError:
+        return
+
+    response = response_getter(result) if response_getter is not None else result
+    builder.add_response_cost(
+        source,
+        response,
+        provider=provider,
+        prompt_price_per_million=prompt_price_per_million,
+        completion_price_per_million=completion_price_per_million,
+    )
+
+
+def track_api_cost(
+    *,
+    source: str,
+    provider: str | None = None,
+    response_getter: ResponseGetter | None = None,
+    prompt_price_per_million: float | None = None,
+    completion_price_per_million: float | None = None,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+    normalized_source = source.strip("/")
+    if not normalized_source:
+        raise ValueError("source must be non-empty")
+
+    normalized_provider = normalize_provider(provider)
+
+    def _decorate(func: Callable[P, R]) -> Callable[P, R]:
+        if iscoroutinefunction(func):
+
+            @wraps(func)
+            async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
+                result = await func(*args, **kwargs)
+                _record_api_cost(
+                    result=result,
+                    source=normalized_source,
+                    provider=normalized_provider,
+                    response_getter=response_getter,
+                    prompt_price_per_million=prompt_price_per_million,
+                    completion_price_per_million=completion_price_per_million,
+                )
+                return result
+
+            return _async_wrapper
+
+        @wraps(func)
+        def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
+            result = func(*args, **kwargs)
+            _record_api_cost(
+                result=result,
+                source=normalized_source,
+                provider=normalized_provider,
+                response_getter=response_getter,
+                prompt_price_per_million=prompt_price_per_million,
+                completion_price_per_million=completion_price_per_million,
+            )
+            return result
+
+        return _sync_wrapper
+
+    return _decorate
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index 1ef7cbae..cef84184 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -182,6 +182,24 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
         assert metrics["costs/eval/judge"] == pytest.approx(2.0)
         assert metrics["costs/all_cum"] == pytest.approx(3.0)
 
+    @pytest.mark.asyncio
+    async def test_add_response_cost_uses_registered_pricing(self) -> None:
+        builder = MetricsBuilder(cost_context="eval")
+        builder.register_token_pricing(
+            "anthropic",
+            prompt_per_million=5.0,
+            completion_per_million=7.0,
+        )
+
+        cost = builder.add_response_cost(
+            "llm_judge/faithfulness",
+            {"usage": {"input_tokens": 40, "output_tokens": 60}},
+        )
+
+        metrics = await builder.flush()
+        assert cost == pytest.approx(0.00062)
+        assert metrics["costs/eval/llm_judge/faithfulness"] == pytest.approx(0.00062)
+
     @pytest.mark.asyncio
     async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
         builder = MetricsBuilder(cost_context="train")

From 29b7836e5062e1c885b2bbb992a91978b4b11b6b Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 15:27:36 -0700
Subject: [PATCH 28/46] fix: Simplify Metrics Logging And Cumulative Naming

---
 docs/metrics-taxonomy.md            |  8 ++---
 src/art/metrics.py                  | 39 ++++++++++++++++------
 src/art/metrics_taxonomy.py         | 24 ++++++++++++--
 src/art/model.py                    | 40 +++++++++++------------
 tests/unit/test_frontend_logging.py | 44 ++++++++++++-------------
 tests/unit/test_metric_routing.py   |  4 +--
 tests/unit/test_metrics_builder.py  | 50 ++++++++++++++---------------
 tests/unit/test_metrics_taxonomy.py | 11 +++++++
 tests/unit/test_track_api_cost.py   |  2 +-
 9 files changed, 135 insertions(+), 87 deletions(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index 300bdf0b..38c12d1d 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -45,7 +45,7 @@ Cost leaves can be logged with either:
 ART rolls costs up automatically:
 
 - parent rollups (for example `costs/train`, `costs/all`)
-- cumulative keys with `_cum` suffix (for example `costs/all_cum`)
+- cumulative keys under the `cum/` namespace (for example `costs/cum/all`)
 
 ## Metrics Added By ART
 
@@ -58,9 +58,9 @@ ART now emits the following metrics from library internals where the data is ava
 - `time/step_wall_s`, `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer`
 - `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`
 - `data/step_num_groups_trainable` for train splits
-- `data/cum_num_unique_scenarios` when scenario IDs are present in group or trajectory metadata
+- `data/cum/num_unique_scenarios` when scenario IDs are present in group or trajectory metadata
 - `data/step_trainer_tokens` where the backend knows the trainer token count
-- `throughput/cum_trainer_idle_s`, `throughput/cum_actor_idle_s`
+- `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s`
 - `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available
 
 Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer.
@@ -135,7 +135,7 @@ The next `model.log(...)` flush for that step will include:
 
 - `costs/train/llm_judge/correctness` (or `costs/eval/...`)
 - hierarchical rollups like `costs/train`, `costs/all`
-- cumulative keys like `costs/all_cum`
+- cumulative keys like `costs/cum/all`
 
 Built-in providers:
 
diff --git a/src/art/metrics.py b/src/art/metrics.py
index bb23f510..0e9846d8 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -19,11 +19,30 @@
 
 _HIERARCHICAL_SECTIONS = {"costs", "time", "data"}
 _THROUGHPUT_IDLE_MAPPINGS = {
-    "throughput/step_trainer_idle_s": "throughput/cum_trainer_idle_s",
-    "throughput/step_actor_idle_s": "throughput/cum_actor_idle_s",
+    "throughput/step_trainer_idle_s": "throughput/cum/trainer_idle_s",
+    "throughput/step_actor_idle_s": "throughput/cum/actor_idle_s",
 }
 
 
+def is_cumulative_metric_key(key: str) -> bool:
+    parts = key.split("/", 2)
+    return len(parts) >= 2 and parts[1] == "cum"
+
+
+def is_builder_managed_metric(key: str) -> bool:
+    return key.startswith(("costs/", "time/step_", "data/step_", "throughput/step_"))
+
+
+def to_cumulative_metric_key(key: str) -> str:
+    if is_cumulative_metric_key(key):
+        raise ValueError(f"Metric key '{key}' is already cumulative.")
+
+    section, rest = key.split("/", 1)
+    if rest.startswith("step_"):
+        rest = rest[len("step_") :]
+    return f"{section}/cum/{rest}"
+
+
 @dataclass
 class _SharedMetricsState:
     lock: asyncio.Lock
@@ -167,7 +186,7 @@ async def flush(self) -> dict[str, float]:
                 section = key.split("/", 1)[0]
                 if section not in _HIERARCHICAL_SECTIONS:
                     continue
-                cum_key = f"{key}_cum"
+                cum_key = to_cumulative_metric_key(key)
                 next_value = self._shared_state.cum_state.get(cum_key, 0.0) + value
                 self._shared_state.cum_state[cum_key] = next_value
                 result[cum_key] = next_value
@@ -176,7 +195,7 @@ async def flush(self) -> dict[str, float]:
                 self._shared_state.unique_scenario_ids.update(
                     self._shared_state.pending_scenario_ids
                 )
-                result["data/cum_num_unique_scenarios"] = float(
+                result["data/cum/num_unique_scenarios"] = float(
                     len(self._shared_state.unique_scenario_ids)
                 )
 
@@ -253,9 +272,9 @@ def load_state_dict(self, state: dict[str, Any]) -> None:
         self._shared_state.pending_scenario_ids.clear()
 
     def _validate_and_add(self, key: str, value: float) -> None:
-        if key.endswith("_cum"):
+        if is_cumulative_metric_key(key):
             raise ValueError(
-                f"Metric key '{key}' ends with '_cum', which is reserved for cumulative metrics."
+                f"Metric key '{key}' uses the reserved cumulative namespace."
             )
 
         for existing_key in self._shared_state.step_buffer:
@@ -329,10 +348,10 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
             or "time/step_trainer_s" in result
         ):
             trainer_tokens = self._shared_state.cum_state.get(
-                "data/step_trainer_tokens_cum"
+                "data/cum/trainer_tokens"
             )
             trainer_seconds = self._shared_state.cum_state.get(
-                "time/step_trainer_s_cum"
+                "time/cum/trainer_s"
             )
             if (
                 trainer_tokens is not None
@@ -345,9 +364,9 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
 
         if "data/step_actor_tokens" in result or "time/step_actor_s" in result:
             actor_tokens = self._shared_state.cum_state.get(
-                "data/step_actor_tokens_cum"
+                "data/cum/actor_tokens"
             )
-            actor_seconds = self._shared_state.cum_state.get("time/step_actor_s_cum")
+            actor_seconds = self._shared_state.cum_state.get("time/cum/actor_s")
             if (
                 actor_tokens is not None
                 and actor_seconds is not None
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
index 061e0872..e4f9e713 100644
--- a/src/art/metrics_taxonomy.py
+++ b/src/art/metrics_taxonomy.py
@@ -33,6 +33,7 @@
     "train_tokens": "data/step_trainer_tokens",
     "num_datums": "data/step_num_datums",
 }
+_INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY})
 
 
 def rename_train_metric_key(metric: str) -> str:
@@ -48,13 +49,32 @@ def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]:
 def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]:
     totals: dict[str, float] = {}
     counts: dict[str, int] = {}
+    invariant_values: dict[str, float] = {}
 
     for sample in metric_samples:
         for key, value in sample.items():
-            totals[key] = totals.get(key, 0.0) + float(value)
+            numeric_value = float(value)
+            if key in _INVARIANT_METRIC_KEYS:
+                previous_value = invariant_values.get(key)
+                if previous_value is None:
+                    invariant_values[key] = numeric_value
+                elif previous_value != numeric_value:
+                    raise ValueError(
+                        f"Metric '{key}' must be invariant across samples, "
+                        f"got {previous_value} and {numeric_value}."
+                    )
+
+            totals[key] = totals.get(key, 0.0) + numeric_value
             counts[key] = counts.get(key, 0) + 1
 
-    return {key: totals[key] / counts[key] for key in totals}
+    return {
+        key: (
+            invariant_values[key]
+            if key in _INVARIANT_METRIC_KEYS
+            else totals[key] / counts[key]
+        )
+        for key in totals
+    }
 
 
 @dataclass(frozen=True)
diff --git a/src/art/model.py b/src/art/model.py
index ff0d75e3..aba894a0 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -15,7 +15,7 @@
 
 from . import dev
 from .costs import CostCalculator
-from .metrics import MetricsBuilder
+from .metrics import MetricsBuilder, is_builder_managed_metric
 from .metrics_taxonomy import (
     TRAIN_GRADIENT_STEPS_KEY,
     average_metric_samples,
@@ -39,7 +39,6 @@
 COSTS_METRIC_PREFIX = "costs_"
 COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
 METRICS_BUILDER_STATE_KEY = "_metrics_builder_state"
-BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_", "throughput/step_")
 METRIC_SECTIONS = frozenset(
     {
         "reward",
@@ -493,7 +492,7 @@ def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
             wandb.define_metric(key, step_metric="training_step")
             self._wandb_defined_metrics.add(key)
 
-    def _extract_non_cost_metrics(
+    def _route_metrics_and_extract_non_costs(
         self, metrics: dict[str, float], split: str
     ) -> dict[str, float]:
         non_cost_metrics: dict[str, float] = {}
@@ -515,7 +514,7 @@ def _extract_non_cost_metrics(
                         f"{cost_context}/{component}", numeric_value
                     )
                 continue
-            if metric.startswith(BUILDER_CUMULATIVE_PREFIXES):
+            if is_builder_managed_metric(metric):
                 self._metrics_builder.add_metric(metric, numeric_value)
                 continue
             non_cost_metrics[metric] = numeric_value
@@ -633,12 +632,13 @@ async def log(
         # If only metrics provided (no trajectories), just log them and return
         if trajectories is None:
             if metrics is not None:
-                metrics_without_costs = self._extract_non_cost_metrics(metrics, split)
-                if metrics_without_costs:
-                    self._log_metrics(metrics_without_costs, split, step)
-                costs = await self._metrics_builder.flush()
-                if costs:
-                    self._log_metrics(costs, split, step)
+                metrics_without_costs = self._route_metrics_and_extract_non_costs(
+                    metrics, split
+                )
+                builder_metrics = await self._metrics_builder.flush()
+                merged_metrics = {**metrics_without_costs, **builder_metrics}
+                if merged_metrics:
+                    self._log_metrics(merged_metrics, split, step)
                 self._persist_metrics_builder_state()
             return
 
@@ -676,7 +676,7 @@ async def log(
 
         for group in trajectory_groups:
             if group.metrics:
-                group_non_cost = self._extract_non_cost_metrics(
+                group_non_cost = self._route_metrics_and_extract_non_costs(
                     cast(dict[str, float], group.metrics), split
                 )
             else:
@@ -701,7 +701,7 @@ async def log(
                         routed_metric = f"reward/{routed_metric}"
                     trajectory_metrics[routed_metric] = float(value)
 
-                non_cost_trajectory_metrics = self._extract_non_cost_metrics(
+                non_cost_trajectory_metrics = self._route_metrics_and_extract_non_costs(
                     trajectory_metrics,
                     split,
                 )
@@ -735,16 +735,16 @@ async def log(
 
         # Merge in any additional metrics passed directly
         if metrics is not None:
-            metrics_without_costs = self._extract_non_cost_metrics(metrics, split)
+            metrics_without_costs = self._route_metrics_and_extract_non_costs(
+                metrics, split
+            )
             averages.update(metrics_without_costs)
 
-        # 3. Log metrics (writes to history.jsonl and wandb)
-        self._log_metrics(averages, split, step)
-
-        # 4. Log cumulative costs
-        costs = await self._metrics_builder.flush()
-        if costs:
-            self._log_metrics(costs, split, step)
+        # 3. Merge in any builder-managed metrics and log a single row.
+        builder_metrics = await self._metrics_builder.flush()
+        merged_metrics = {**averages, **builder_metrics}
+        if merged_metrics:
+            self._log_metrics(merged_metrics, split, step)
         self._persist_metrics_builder_state()
 
     async def get_step(self) -> int:
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 7f689c42..755d7e64 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -227,9 +227,7 @@ async def test_history_appends_entries(
         history_path = tmp_path / "test-project/models/test-model/history.jsonl"
         df = pl.read_ndjson(str(history_path))
 
-        # Each log call now emits the primary metrics row plus a taxonomy
-        # row for cumulative data/time metrics.
-        assert len(df) == 4
+        assert len(df) == 2
 
         # Check both splits are present
         columns = df.columns
@@ -351,9 +349,9 @@ async def test_metric_prefixes(self, tmp_path: Path):
                 "time/wall_clock_sec",
             ]
         ]
-        assert all(k.startswith("val/") for k in metric_keys), (
-            f"Not all metrics prefixed: {metric_keys}"
-        )
+        assert all(
+            k.startswith(("val/", "data/")) for k in metric_keys
+        ), f"Not all metrics routed into taxonomy namespaces: {metric_keys}"
         assert entry["training_step"] == 0
         assert entry["time/wall_clock_sec"] >= 0
 
@@ -619,7 +617,7 @@ async def test_train_logs_add_default_data_metrics_from_trajectory_groups(
         assert merged["data/step_num_trajectories"] == pytest.approx(3.0)
         assert merged["data/step_num_groups_submitted"] == pytest.approx(2.0)
         assert merged["data/step_num_groups_trainable"] == pytest.approx(1.0)
-        assert merged["data/cum_num_unique_scenarios"] == pytest.approx(2.0)
+        assert merged["data/cum/num_unique_scenarios"] == pytest.approx(2.0)
         assert merged["train/num_groups_submitted"] == pytest.approx(2.0)
         assert merged["train/num_groups_trainable"] == pytest.approx(1.0)
         assert merged["train/num_trajectories"] == pytest.approx(3.0)
@@ -660,12 +658,12 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
         assert first["costs/train/sample"] == pytest.approx(0.3)
         assert first["costs/train"] == pytest.approx(0.5)
         assert first["costs/all"] == pytest.approx(0.5)
-        assert first["costs/all_cum"] == pytest.approx(0.5)
+        assert first["costs/cum/all"] == pytest.approx(0.5)
 
         assert second["costs/train/prefill"] == pytest.approx(0.1)
-        assert second["costs/train/prefill_cum"] == pytest.approx(0.3)
-        assert second["costs/train_cum"] == pytest.approx(0.6)
-        assert second["costs/all_cum"] == pytest.approx(0.6)
+        assert second["costs/cum/train/prefill"] == pytest.approx(0.3)
+        assert second["costs/cum/train"] == pytest.approx(0.6)
+        assert second["costs/cum/all"] == pytest.approx(0.6)
 
     @pytest.mark.asyncio
     async def test_cost_cumulative_persists_across_model_recreation(
@@ -702,9 +700,9 @@ async def test_cost_cumulative_persists_across_model_recreation(
             first = json.loads(f.readline())
             second = json.loads(f.readline())
 
-        assert first["costs/train/prefill_cum"] == pytest.approx(0.25)
-        assert second["costs/train/prefill_cum"] == pytest.approx(1.0)
-        assert second["costs/all_cum"] == pytest.approx(1.0)
+        assert first["costs/cum/train/prefill"] == pytest.approx(0.25)
+        assert second["costs/cum/train/prefill"] == pytest.approx(1.0)
+        assert second["costs/cum/all"] == pytest.approx(1.0)
 
     @pytest.mark.asyncio
     async def test_metrics_builder_loads_resume_state_before_builder_use(
@@ -733,8 +731,8 @@ async def test_metrics_builder_loads_resume_state_before_builder_use(
             first = json.loads(f.readline())
             second = json.loads(f.readline())
 
-        assert first["data/cum_num_unique_scenarios"] == pytest.approx(1.0)
-        assert second["data/cum_num_unique_scenarios"] == pytest.approx(2.0)
+        assert first["data/cum/num_unique_scenarios"] == pytest.approx(1.0)
+        assert second["data/cum/num_unique_scenarios"] == pytest.approx(2.0)
 
     @pytest.mark.asyncio
     async def test_direct_time_and_data_metrics_get_cumulative_variants(
@@ -762,9 +760,9 @@ async def test_direct_time_and_data_metrics_get_cumulative_variants(
             entry = json.loads(f.readline())
 
         assert entry["time/step_actor_s"] == pytest.approx(1.5)
-        assert entry["time/step_actor_s_cum"] == pytest.approx(1.5)
+        assert entry["time/cum/actor_s"] == pytest.approx(1.5)
         assert entry["data/step_actor_tokens"] == pytest.approx(10)
-        assert entry["data/step_actor_tokens_cum"] == pytest.approx(10)
+        assert entry["data/cum/actor_tokens"] == pytest.approx(10)
 
     @pytest.mark.asyncio
     async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row(
@@ -799,10 +797,10 @@ async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row(
 
         assert len(rows) == 2
         assert rows[0]["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
-        assert rows[0]["data/cum_num_unique_scenarios"] == pytest.approx(1.0)
+        assert rows[0]["data/cum/num_unique_scenarios"] == pytest.approx(1.0)
         assert rows[1]["loss/train"] == pytest.approx(1.0)
         assert "throughput/avg_trainer_tok_per_s" not in rows[1]
-        assert "data/cum_num_unique_scenarios" not in rows[1]
+        assert "data/cum/num_unique_scenarios" not in rows[1]
 
 
 class TestWandbIntegration:
@@ -984,7 +982,7 @@ async def mock_train_sft(*args, **kwargs):
         with open(history_path) as f:
             lines = f.readlines()
 
-        assert len(lines) == 2, f"Expected 2 log entries, got {len(lines)}"
+        assert len(lines) == 1, f"Expected 1 log entry, got {len(lines)}"
 
         entries = [json.loads(line) for line in lines]
         merged: dict[str, float] = {}
@@ -995,7 +993,7 @@ async def mock_train_sft(*args, **kwargs):
         assert merged["loss/train"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
         assert merged["loss/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
         assert merged["time/step_trainer_s"] >= 0
-        assert merged["time/step_trainer_s_cum"] >= 0
+        assert merged["time/cum/trainer_s"] >= 0
 
     @pytest.mark.asyncio
     async def test_train_sft_single_step_increment(self, tmp_path: Path):
@@ -1032,7 +1030,7 @@ async def mock_train_sft(*args, **kwargs):
         history_path = tmp_path / "test-project/models/test-sft-step/history.jsonl"
         df = pl.read_ndjson(str(history_path))
 
-        assert len(df) == 2, "Should have exactly 2 log entries"
+        assert len(df) == 1, "Should have exactly 1 log entry"
         assert set(df["step"].to_list()) == {1}, "Step should be 1 (single increment)"
 
     @pytest.mark.asyncio
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
index 2587385d..d7dcd2b5 100644
--- a/tests/unit/test_metric_routing.py
+++ b/tests/unit/test_metric_routing.py
@@ -97,7 +97,7 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step(
                 model._log_metrics(
                     {
                         "costs/train/sample": 0.1,
-                        "costs/train/prefill_cum": 0.2,
+                        "costs/cum/train/prefill": 0.2,
                     },
                     split="train",
                     step=1,
@@ -109,6 +109,6 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step(
         ]
         assert (("costs/train/sample",), {"step_metric": "training_step"}) in define_calls
         assert (
-            (("costs/train/prefill_cum",), {"step_metric": "training_step"})
+            (("costs/cum/train/prefill",), {"step_metric": "training_step"})
             in define_calls
         )
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index cef84184..d1df45e8 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -21,9 +21,9 @@ async def test_rollup_correctness_across_depths(self) -> None:
         assert metrics["costs/train"] == pytest.approx(1.77)
         assert metrics["costs/eval"] == pytest.approx(0.06)
         assert metrics["costs/all"] == pytest.approx(1.83)
-        assert metrics["costs/train/llm_judge_cum"] == pytest.approx(0.12)
-        assert metrics["costs/train_cum"] == pytest.approx(1.77)
-        assert metrics["costs/all_cum"] == pytest.approx(1.83)
+        assert metrics["costs/cum/train/llm_judge"] == pytest.approx(0.12)
+        assert metrics["costs/cum/train"] == pytest.approx(1.77)
+        assert metrics["costs/cum/all"] == pytest.approx(1.83)
 
     @pytest.mark.asyncio
     async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
@@ -37,11 +37,11 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
         )
         first = await builder.flush()
 
-        assert first["time/step_wall_s_cum"] == pytest.approx(1.5)
-        assert first["time/step_actor_s_cum"] == pytest.approx(0.3)
-        assert first["data/step_num_scenarios_cum"] == pytest.approx(2)
-        assert first["data/step_actor_tokens_cum"] == pytest.approx(10)
-        assert first["data/cum_num_unique_scenarios"] == 2
+        assert first["time/cum/wall_s"] == pytest.approx(1.5)
+        assert first["time/cum/actor_s"] == pytest.approx(0.3)
+        assert first["data/cum/num_scenarios"] == pytest.approx(2)
+        assert first["data/cum/actor_tokens"] == pytest.approx(10)
+        assert first["data/cum/num_unique_scenarios"] == 2
 
         builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2)
         builder.add_data(
@@ -51,11 +51,11 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
         )
         second = await builder.flush()
 
-        assert second["time/step_wall_s_cum"] == pytest.approx(2.0)
-        assert second["time/step_actor_s_cum"] == pytest.approx(0.5)
-        assert second["data/step_num_scenarios_cum"] == pytest.approx(5)
-        assert second["data/step_actor_tokens_cum"] == pytest.approx(15)
-        assert second["data/cum_num_unique_scenarios"] == 3
+        assert second["time/cum/wall_s"] == pytest.approx(2.0)
+        assert second["time/cum/actor_s"] == pytest.approx(0.5)
+        assert second["data/cum/num_scenarios"] == pytest.approx(5)
+        assert second["data/cum/actor_tokens"] == pytest.approx(15)
+        assert second["data/cum/num_unique_scenarios"] == 3
 
     @pytest.mark.asyncio
     async def test_helper_metrics_accumulate_within_a_single_step(self) -> None:
@@ -90,8 +90,8 @@ async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -
 
         metrics = await builder.flush()
 
-        assert metrics["throughput/cum_trainer_idle_s"] == pytest.approx(1.5)
-        assert metrics["throughput/cum_actor_idle_s"] == pytest.approx(0.5)
+        assert metrics["throughput/cum/trainer_idle_s"] == pytest.approx(1.5)
+        assert metrics["throughput/cum/actor_idle_s"] == pytest.approx(0.5)
         assert metrics["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
         assert metrics["throughput/avg_actor_tok_per_s"] == pytest.approx(5.0)
 
@@ -131,17 +131,17 @@ async def test_duplicate_leaf_writes_are_summed(self) -> None:
         assert metrics["costs/train"] == pytest.approx(2.0)
         assert metrics["costs/all"] == pytest.approx(2.0)
 
-    def test_cum_suffix_is_reserved(self) -> None:
+    def test_cumulative_namespace_is_reserved(self) -> None:
         builder = MetricsBuilder(cost_context="train")
         with pytest.raises(ValueError):
-            builder.add_cost("train/llm_judge_cum", usd=0.1)
+            builder.add_metric("costs/cum/train/llm_judge", 0.1)
 
     @pytest.mark.asyncio
     async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None:
         builder = MetricsBuilder(cost_context="train")
         builder.add_cost("train/gpu", usd=1.0)
         first = await builder.flush()
-        assert first["costs/train_cum"] == pytest.approx(1.0)
+        assert first["costs/cum/train"] == pytest.approx(1.0)
 
         second = await builder.flush()
         assert not any(key.startswith("costs/") for key in second)
@@ -149,7 +149,7 @@ async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None:
         builder.add_cost("train/gpu", usd=2.0)
         third = await builder.flush()
         assert third["costs/train"] == pytest.approx(2.0)
-        assert third["costs/train_cum"] == pytest.approx(3.0)
+        assert third["costs/cum/train"] == pytest.approx(3.0)
 
     @pytest.mark.asyncio
     async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
@@ -163,8 +163,8 @@ async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
         after.add_cost("train/gpu", usd=2.0)
 
         metrics = await after.flush()
-        assert metrics["costs/train_cum"] == pytest.approx(3.0)
-        assert metrics["costs/all_cum"] == pytest.approx(3.0)
+        assert metrics["costs/cum/train"] == pytest.approx(3.0)
+        assert metrics["costs/cum/all"] == pytest.approx(3.0)
 
     @pytest.mark.asyncio
     async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
@@ -180,7 +180,7 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
 
         metrics = await eval_builder.flush()
         assert metrics["costs/eval/judge"] == pytest.approx(2.0)
-        assert metrics["costs/all_cum"] == pytest.approx(3.0)
+        assert metrics["costs/cum/all"] == pytest.approx(3.0)
 
     @pytest.mark.asyncio
     async def test_add_response_cost_uses_registered_pricing(self) -> None:
@@ -205,11 +205,11 @@ async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
         builder = MetricsBuilder(cost_context="train")
         builder.add_data(scenario_ids=["s1", "s2", "s3"])
         first = await builder.flush()
-        assert first["data/cum_num_unique_scenarios"] == 3
+        assert first["data/cum/num_unique_scenarios"] == 3
 
         builder.add_data(scenario_ids=["s2", "s4"])
         second = await builder.flush()
-        assert second["data/cum_num_unique_scenarios"] == 4
+        assert second["data/cum/num_unique_scenarios"] == 4
 
     @pytest.mark.asyncio
     async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None:
@@ -220,7 +220,7 @@ async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None:
 
         first = await builder.flush()
         assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
-        assert first["data/cum_num_unique_scenarios"] == 1
+        assert first["data/cum/num_unique_scenarios"] == 1
 
         second = await builder.flush()
         assert second == {}
diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py
index 45085476..7d22ddf2 100644
--- a/tests/unit/test_metrics_taxonomy.py
+++ b/tests/unit/test_metrics_taxonomy.py
@@ -1,6 +1,7 @@
 import pytest
 
 from art.metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
     TrajectoryBatchSummary,
     average_metric_samples,
     build_training_summary_metrics,
@@ -38,3 +39,13 @@ def test_build_training_summary_metrics_includes_data_and_train_sections() -> No
     assert metrics["data/step_num_groups_trainable"] == pytest.approx(1.0)
     assert metrics["train/num_groups_submitted"] == pytest.approx(2.0)
     assert metrics["train/num_trajectories"] == pytest.approx(5.0)
+
+
+def test_average_metric_samples_requires_invariant_gradient_step_count() -> None:
+    with pytest.raises(ValueError, match="must be invariant"):
+        average_metric_samples(
+            [
+                {TRAIN_GRADIENT_STEPS_KEY: 2.0},
+                {TRAIN_GRADIENT_STEPS_KEY: 3.0},
+            ]
+        )
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index 897a57d2..b8ac6292 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -190,7 +190,7 @@ async def _eval_judge() -> _AnthropicResponse:
 
         assert first["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
         assert second["costs/eval/llm_judge/factuality"] == pytest.approx(0.00016)
-        assert second["costs/all_cum"] == pytest.approx(0.00036)
+        assert second["costs/cum/all"] == pytest.approx(0.00036)
 
     @pytest.mark.asyncio
     async def test_pipeline_trainer_activates_train_context_for_rollouts(

From 8c2042c1cb939db20844ffa7bb8b400e5447e72d Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 15:31:02 -0700
Subject: [PATCH 29/46] fix: Require Model-Aware Api Cost Pricing

---
 docs/metrics-taxonomy.md           |  20 ++--
 examples/metrics_taxonomy_smoke.py |   3 +-
 src/art/metrics.py                 |  24 +++--
 src/art/metrics_api_cost.py        | 146 +++++++++++++++++++++++------
 tests/unit/test_metrics_builder.py |  11 ++-
 tests/unit/test_track_api_cost.py  |  44 +++++++--
 6 files changed, 191 insertions(+), 57 deletions(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index 38c12d1d..bc4f7980 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -105,12 +105,11 @@ from art.metrics import track_api_cost
 @track_api_cost(
     source="llm_judge/correctness",
     provider="openai",
-    prompt_price_per_million=1.0,
-    completion_price_per_million=2.0,
+    model_name="openai/gpt-oss-20b",
 )
 async def run_judge(client, messages):
     return await client.chat.completions.create(
-        model="gpt-4o-mini",
+        model="gpt-oss-20b",
         messages=messages,
     )
 ```
@@ -137,15 +136,24 @@ The next `model.log(...)` flush for that step will include:
 - hierarchical rollups like `costs/train`, `costs/all`
 - cumulative keys like `costs/cum/all`
 
-Built-in providers:
+Built-in usage extraction:
 
 - OpenAI usage (`prompt_tokens`, `completion_tokens`)
 - Anthropic usage (`input_tokens`, `output_tokens`)
 
-You can override pricing per decorator call or configure builder-level defaults:
+Pricing is model-aware by default. ART will use the configured model pricing from
+`art.costs.MODEL_PRICING` when it can resolve a concrete model name, and it
+raises instead of guessing when pricing is missing.
+
+You can still override pricing per decorator call or register model-specific
+pricing on the builder:
 
 ```python
 builder = model.metrics_builder()
-builder.register_token_pricing("openai", prompt_per_million=1.2, completion_per_million=4.8)
+builder.register_model_pricing(
+    "anthropic/my-custom-judge",
+    prompt_per_million=1.2,
+    completion_per_million=4.8,
+)
 builder.register_cost_extractor("openai", lambda response: 0.001)  # optional custom extractor
 ```
diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py
index 4f2c4a2f..ff4d4afe 100644
--- a/examples/metrics_taxonomy_smoke.py
+++ b/examples/metrics_taxonomy_smoke.py
@@ -25,8 +25,7 @@ def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
 @track_api_cost(
     source="llm_judge/decorator_demo",
     provider="openai",
-    prompt_price_per_million=1.0,
-    completion_price_per_million=2.0,
+    model_name="openai/gpt-oss-20b",
 )
 async def _mock_judge_call(step: int) -> _Response:
     return _Response(
diff --git a/src/art/metrics.py b/src/art/metrics.py
index 0e9846d8..29946316 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -8,8 +8,8 @@
 from typing import Any
 
 from .metrics_api_cost import (
-    DEFAULT_TOKEN_PRICING,
     CostExtractor,
+    ModelNameGetter,
     TokenPricing,
     extract_api_cost,
     normalize_provider,
@@ -51,7 +51,7 @@ class _SharedMetricsState:
     unique_scenario_ids: set[str]
     pending_scenario_ids: set[str]
     cost_extractors: dict[str, CostExtractor]
-    token_pricing: dict[str, TokenPricing]
+    model_pricing: dict[str, TokenPricing]
 
 
 def _new_shared_metrics_state() -> _SharedMetricsState:
@@ -62,7 +62,7 @@ def _new_shared_metrics_state() -> _SharedMetricsState:
         unique_scenario_ids=set(),
         pending_scenario_ids=set(),
         cost_extractors={},
-        token_pricing=dict(DEFAULT_TOKEN_PRICING),
+        model_pricing={},
     )
 
 
@@ -95,6 +95,8 @@ def add_response_cost(
         response: Any,
         *,
         provider: str | None = None,
+        model_name: str | None = None,
+        model_name_getter: "ModelNameGetter | None" = None,
         prompt_price_per_million: float | None = None,
         completion_price_per_million: float | None = None,
     ) -> float | None:
@@ -105,10 +107,12 @@ def add_response_cost(
         cost = extract_api_cost(
             response,
             provider=provider,
+            model_name=model_name,
+            model_name_getter=model_name_getter,
             prompt_price_per_million=prompt_price_per_million,
             completion_price_per_million=completion_price_per_million,
             cost_extractors=self._shared_state.cost_extractors,
-            token_pricing=self._shared_state.token_pricing,
+            model_pricing=self._shared_state.model_pricing,
         )
         if cost is None:
             return None
@@ -238,17 +242,17 @@ def register_cost_extractor(
             raise ValueError("provider must be non-empty")
         self._shared_state.cost_extractors[normalized_provider] = extractor
 
-    def register_token_pricing(
+    def register_model_pricing(
         self,
-        provider: str,
+        model_name: str,
         *,
         prompt_per_million: float,
         completion_per_million: float,
     ) -> None:
-        normalized_provider = normalize_provider(provider)
-        if normalized_provider is None:
-            raise ValueError("provider must be non-empty")
-        self._shared_state.token_pricing[normalized_provider] = TokenPricing(
+        normalized_model_name = model_name.strip()
+        if not normalized_model_name:
+            raise ValueError("model_name must be non-empty")
+        self._shared_state.model_pricing[normalized_model_name] = TokenPricing(
             prompt_per_million=float(prompt_per_million),
             completion_per_million=float(completion_per_million),
         )
diff --git a/src/art/metrics_api_cost.py b/src/art/metrics_api_cost.py
index f9a8d3eb..6d713192 100644
--- a/src/art/metrics_api_cost.py
+++ b/src/art/metrics_api_cost.py
@@ -6,9 +6,8 @@
 from inspect import iscoroutinefunction
 from typing import Any, ParamSpec, TypeVar
 
-from .costs import tokens_to_cost
+from .costs import get_model_pricing, tokens_to_cost
 
-DEFAULT_PROVIDER = "openai"
 OPENAI_PROVIDER = "openai"
 ANTHROPIC_PROVIDER = "anthropic"
 
@@ -16,6 +15,7 @@
 R = TypeVar("R")
 
 CostExtractor = Callable[[Any], float | None]
+ModelNameGetter = Callable[[Any], str | None]
 ResponseGetter = Callable[[Any], Any]
 
 
@@ -24,16 +24,6 @@ class TokenPricing:
     prompt_per_million: float
     completion_per_million: float
 
-
-DEFAULT_TOKEN_PRICING = {
-    OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0),
-    ANTHROPIC_PROVIDER: TokenPricing(
-        prompt_per_million=3.0,
-        completion_per_million=15.0,
-    ),
-}
-
-
 def normalize_provider(provider: str | None) -> str | None:
     if provider is None:
         return None
@@ -61,6 +51,17 @@ def _response_usage(response: Any) -> Any:
     return getattr(response, "usage", None)
 
 
+def _response_model_name(response: Any) -> str | None:
+    if isinstance(response, dict):
+        value = response.get("model")
+    else:
+        value = getattr(response, "model", None)
+    if value is None:
+        return None
+    normalized = str(value).strip()
+    return normalized or None
+
+
 def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None:
     usage = _response_usage(response)
     prompt_tokens = _read_usage_field(usage, "prompt_tokens")
@@ -110,28 +111,97 @@ def _estimate_cost(
     )
 
 
-def _resolve_token_pricing(
+def _resolve_model_name(
+    response: Any,
+    *,
     provider: str | None,
+    model_name: str | None,
+    model_name_getter: ModelNameGetter | None,
+) -> str | None:
+    explicit_model_name = model_name.strip() if model_name is not None else None
+    if explicit_model_name:
+        candidate = explicit_model_name
+    elif model_name_getter is not None:
+        candidate = model_name_getter(response)
+    else:
+        candidate = _response_model_name(response)
+
+    if candidate is None:
+        return None
+
+    normalized_model_name = str(candidate).strip()
+    if not normalized_model_name:
+        return None
+
+    normalized_provider = normalize_provider(provider)
+    if normalized_provider is not None and "/" not in normalized_model_name:
+        provider_scoped_name = f"{normalized_provider}/{normalized_model_name}"
+        if get_model_pricing(provider_scoped_name) is not None:
+            return provider_scoped_name
+
+    return normalized_model_name
+
+
+def _resolve_token_pricing(
+    response: Any,
     *,
+    provider: str | None,
+    model_name: str | None,
+    model_name_getter: ModelNameGetter | None,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
-    token_pricing: Mapping[str, TokenPricing],
+    model_pricing: Mapping[str, TokenPricing],
 ) -> TokenPricing:
-    normalized_provider = normalize_provider(provider) or DEFAULT_PROVIDER
-    default_pricing = token_pricing.get(
-        normalized_provider,
-        token_pricing[DEFAULT_PROVIDER],
+    explicit_prompt_price = (
+        float(prompt_price_per_million)
+        if prompt_price_per_million is not None
+        else None
+    )
+    explicit_completion_price = (
+        float(completion_price_per_million)
+        if completion_price_per_million is not None
+        else None
     )
+    if (
+        explicit_prompt_price is not None
+        and explicit_completion_price is not None
+    ):
+        return TokenPricing(
+            prompt_per_million=explicit_prompt_price,
+            completion_per_million=explicit_completion_price,
+        )
+
+    resolved_model_name = _resolve_model_name(
+        response,
+        provider=provider,
+        model_name=model_name,
+        model_name_getter=model_name_getter,
+    )
+    if resolved_model_name is None:
+        raise ValueError(
+            "API cost tracking requires model-aware pricing. "
+            "Provide both explicit token prices or supply a model_name "
+            "(or response.model / model_name_getter) with configured pricing."
+        )
+
+    configured_pricing = model_pricing.get(resolved_model_name)
+    if configured_pricing is None:
+        pricing = get_model_pricing(resolved_model_name, strict=True)
+        configured_pricing = TokenPricing(
+            prompt_per_million=pricing.prefill,
+            completion_per_million=pricing.sample,
+        )
+
     return TokenPricing(
         prompt_per_million=(
-            float(prompt_price_per_million)
-            if prompt_price_per_million is not None
-            else default_pricing.prompt_per_million
+            explicit_prompt_price
+            if explicit_prompt_price is not None
+            else configured_pricing.prompt_per_million
         ),
         completion_per_million=(
-            float(completion_price_per_million)
-            if completion_price_per_million is not None
-            else default_pricing.completion_per_million
+            explicit_completion_price
+            if explicit_completion_price is not None
+            else configured_pricing.completion_per_million
         ),
     )
 
@@ -140,10 +210,12 @@ def extract_api_cost(
     response: Any,
     *,
     provider: str | None,
+    model_name: str | None,
+    model_name_getter: ModelNameGetter | None,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
     cost_extractors: Mapping[str, CostExtractor],
-    token_pricing: Mapping[str, TokenPricing],
+    model_pricing: Mapping[str, TokenPricing],
 ) -> float | None:
     provider_name = normalize_provider(provider) or _detect_provider(response)
     if provider_name is not None:
@@ -154,10 +226,13 @@ def extract_api_cost(
                 return float(custom_cost)
 
         pricing = _resolve_token_pricing(
-            provider_name,
+            response,
+            provider=provider_name,
+            model_name=model_name,
+            model_name_getter=model_name_getter,
             prompt_price_per_million=prompt_price_per_million,
             completion_price_per_million=completion_price_per_million,
-            token_pricing=token_pricing,
+            model_pricing=model_pricing,
         )
         if provider_name == OPENAI_PROVIDER:
             return _estimate_cost(_extract_openai_token_counts(response), pricing)
@@ -165,10 +240,13 @@ def extract_api_cost(
             return _estimate_cost(_extract_anthropic_token_counts(response), pricing)
 
     pricing = _resolve_token_pricing(
-        provider_name,
+        response,
+        provider=provider_name,
+        model_name=model_name,
+        model_name_getter=model_name_getter,
         prompt_price_per_million=prompt_price_per_million,
         completion_price_per_million=completion_price_per_million,
-        token_pricing=token_pricing,
+        model_pricing=model_pricing,
     )
     token_counts = _extract_openai_token_counts(response)
     if token_counts is None:
@@ -182,6 +260,8 @@ def _record_api_cost(
     source: str,
     provider: str | None,
     response_getter: ResponseGetter | None,
+    model_name: str | None,
+    model_name_getter: ModelNameGetter | None,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
 ) -> None:
@@ -197,6 +277,8 @@ def _record_api_cost(
         source,
         response,
         provider=provider,
+        model_name=model_name,
+        model_name_getter=model_name_getter,
         prompt_price_per_million=prompt_price_per_million,
         completion_price_per_million=completion_price_per_million,
     )
@@ -206,6 +288,8 @@ def track_api_cost(
     *,
     source: str,
     provider: str | None = None,
+    model_name: str | None = None,
+    model_name_getter: ModelNameGetter | None = None,
     response_getter: ResponseGetter | None = None,
     prompt_price_per_million: float | None = None,
     completion_price_per_million: float | None = None,
@@ -227,6 +311,8 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
                     source=normalized_source,
                     provider=normalized_provider,
                     response_getter=response_getter,
+                    model_name=model_name,
+                    model_name_getter=model_name_getter,
                     prompt_price_per_million=prompt_price_per_million,
                     completion_price_per_million=completion_price_per_million,
                 )
@@ -242,6 +328,8 @@ def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
                 source=normalized_source,
                 provider=normalized_provider,
                 response_getter=response_getter,
+                model_name=model_name,
+                model_name_getter=model_name_getter,
                 prompt_price_per_million=prompt_price_per_million,
                 completion_price_per_million=completion_price_per_million,
             )
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index d1df45e8..57def527 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -183,17 +183,20 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
         assert metrics["costs/cum/all"] == pytest.approx(3.0)
 
     @pytest.mark.asyncio
-    async def test_add_response_cost_uses_registered_pricing(self) -> None:
+    async def test_add_response_cost_uses_registered_model_pricing(self) -> None:
         builder = MetricsBuilder(cost_context="eval")
-        builder.register_token_pricing(
-            "anthropic",
+        builder.register_model_pricing(
+            "anthropic/test-judge",
             prompt_per_million=5.0,
             completion_per_million=7.0,
         )
 
         cost = builder.add_response_cost(
             "llm_judge/faithfulness",
-            {"usage": {"input_tokens": 40, "output_tokens": 60}},
+            {
+                "model": "anthropic/test-judge",
+                "usage": {"input_tokens": 40, "output_tokens": 60},
+            },
         )
 
         metrics = await builder.flush()
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index b8ac6292..5162595f 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -17,8 +17,15 @@ def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
 
 
 class _OpenAIResponse:
-    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
+    def __init__(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+        *,
+        model: str | None = None,
+    ) -> None:
         self.usage = _OpenAIUsage(prompt_tokens, completion_tokens)
+        self.model = model
 
 
 class _AnthropicUsage:
@@ -28,8 +35,15 @@ def __init__(self, input_tokens: int, output_tokens: int) -> None:
 
 
 class _AnthropicResponse:
-    def __init__(self, input_tokens: int, output_tokens: int) -> None:
+    def __init__(
+        self,
+        input_tokens: int,
+        output_tokens: int,
+        *,
+        model: str | None = None,
+    ) -> None:
         self.usage = _AnthropicUsage(input_tokens, output_tokens)
+        self.model = model
 
 
 class TestTrackApiCost:
@@ -56,15 +70,18 @@ async def _judge() -> _OpenAIResponse:
         assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
 
     @pytest.mark.asyncio
-    async def test_anthropic_cost_extraction_uses_registered_pricing(self) -> None:
+    async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None:
         builder = MetricsBuilder(cost_context="train")
-        builder.register_token_pricing(
-            "anthropic",
+        builder.register_model_pricing(
+            "anthropic/test-judge",
             prompt_per_million=5.0,
             completion_per_million=7.0,
         )
 
-        @track_api_cost(source="llm_judge/faithfulness")
+        @track_api_cost(
+            source="llm_judge/faithfulness",
+            model_name="anthropic/test-judge",
+        )
         async def _judge() -> _AnthropicResponse:
             return _AnthropicResponse(input_tokens=40, output_tokens=60)
 
@@ -77,6 +94,21 @@ async def _judge() -> _AnthropicResponse:
         metrics = await builder.flush()
         assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062)
 
+    @pytest.mark.asyncio
+    async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(source="llm_judge/missing_pricing", provider="openai")
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
+
+        token = builder.activate()
+        try:
+            with pytest.raises(ValueError, match="model-aware pricing"):
+                await _judge()
+        finally:
+            token.var.reset(token)
+
     @pytest.mark.asyncio
     async def test_custom_extractor_takes_precedence(self) -> None:
         builder = MetricsBuilder(cost_context="train")

From d2e92131722a6ed57cda5c4c010679ab443e707b Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 15:32:42 -0700
Subject: [PATCH 30/46] fix: Normalize Unsloth Eval Metric Routing

---
 src/art/unsloth/train.py           | 16 ++++++++++------
 tests/unit/test_unsloth_metrics.py | 25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 tests/unit/test_unsloth_metrics.py

diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index f095fe35..4b8d15d7 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -12,7 +12,7 @@
 
 from .. import dev
 from ..loss import loss_fn, shift_tensor
-from ..metrics_taxonomy import rename_train_metrics
+from ..metrics_taxonomy import rename_train_metric_key, rename_train_metrics
 from ..types import TrainConfig
 
 if TYPE_CHECKING:
@@ -194,12 +194,16 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None:
         }  # average the metrics
 
         # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
-        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        # start with "eval_". Normalize them into the `val/...` taxonomy instead.
         if next(iter(logs.keys())).startswith("eval_"):
-            metrics = {f"eval_{key}": val for key, val in metrics.items()}
-
-        logs = {**rename_train_metrics(logs), **metrics}
-        results_queue.put_nowait(logs)
+            # Only the first key was checked: strip "eval_" without truncating others.
+            val_logs = {f"val/{key}": val for key, val in metrics.items()}
+            for key, val in logs.items():
+                name = rename_train_metric_key(key.removeprefix("eval_"))
+                val_logs[f"val/{name}"] = val
+            results_queue.put_nowait(val_logs)
+        else:
+            results_queue.put_nowait({**rename_train_metrics(logs), **metrics})
         trainer._metrics["train"].clear()
 
     return log
diff --git a/tests/unit/test_unsloth_metrics.py b/tests/unit/test_unsloth_metrics.py
new file mode 100644
index 00000000..fdb91b0c
--- /dev/null
+++ b/tests/unit/test_unsloth_metrics.py
@@ -0,0 +1,25 @@
+import asyncio
+from collections import defaultdict
+
+from art.unsloth.train import get_log_fn
+
+
+class _DummyTrainer:
+    def __init__(self) -> None:
+        self._metrics = {"train": defaultdict(list)}
+
+
+def test_get_log_fn_routes_eval_metrics_to_val_namespace() -> None:
+    trainer = _DummyTrainer()
+    trainer._metrics["train"]["loss/train"].append(1.5)
+    trainer._metrics["train"]["loss/entropy"].append(0.2)
+    results_queue: asyncio.Queue[dict[str, float]] = asyncio.Queue()
+
+    log = get_log_fn(trainer, results_queue)
+    log({"eval_loss": 1.0, "eval_runtime": 2.0})
+
+    assert results_queue.get_nowait() == {
+        "val/loss/train": 1.0,
+        "val/loss/entropy": 0.2,
+        "val/runtime": 2.0,
+    }

From 5c46148b4bd8b3b537cf5e8644e7d1828dfe927c Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 15:56:36 -0700
Subject: [PATCH 31/46] fix: Align Wandb Logging With Training Step

---
 src/art/model.py                  | 5 ++++-
 tests/unit/test_metric_routing.py | 7 +++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/art/model.py b/src/art/model.py
index aba894a0..b844f399 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -479,7 +479,10 @@ def _log_metrics(
         if should_log_wandb:
             if run := self._get_wandb_run():
                 self._define_wandb_step_metrics(prefixed.keys())
-                run.log(prefixed)
+                # Keep W&B's internal step aligned with ART's training_step so
+                # multiple log calls for the same training step do not inflate
+                # the run's step count.
+                run.log(prefixed, step=step)
 
     def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
         import wandb
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
index d7dcd2b5..4d6316d0 100644
--- a/tests/unit/test_metric_routing.py
+++ b/tests/unit/test_metric_routing.py
@@ -112,3 +112,10 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step(
             (("costs/cum/train/prefill",), {"step_metric": "training_step"})
             in define_calls
         )
+        fake_run.log.assert_called_once()
+        logged_metrics = fake_run.log.call_args.args[0]
+        assert logged_metrics["costs/train/sample"] == 0.1
+        assert logged_metrics["costs/cum/train/prefill"] == 0.2
+        assert logged_metrics["training_step"] == 1
+        assert "time/wall_clock_sec" in logged_metrics
+        assert fake_run.log.call_args.kwargs == {"step": 1}

From ad51d340f11688e7d903390a2496f83aee2c6799 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 15:56:41 -0700
Subject: [PATCH 32/46] refactor: Use Backend Train In Metrics Demo

---
 dev/yes-no-maybe-metrics.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py
index 32729990..036e3847 100644
--- a/dev/yes-no-maybe-metrics.py
+++ b/dev/yes-no-maybe-metrics.py
@@ -238,19 +238,22 @@ async def main() -> None:
                 train_builder.add_data(
                     step_actor_tokens=total_actor_tokens(train_groups)
                 )
-                await model.train(
+                result = await backend.train(
+                    model,
                     train_groups,
-                    config=art.TrainConfig(learning_rate=learning_rate),
+                    learning_rate=learning_rate,
                 )
 
-            step = await model.get_step()
             await model.log(
-                trajectories=None,
                 split="train",
-                step=step,
-                metrics={"time/step_wall_s": time.monotonic() - step_started},
+                step=result.step,
+                trajectories=train_groups,
+                metrics={
+                    **result.metrics,
+                    "time/step_wall_s": time.monotonic() - step_started,
+                },
             )
-            print(f"step {step} complete")
+            print(f"step {result.step} complete")
 
         print_history_summary(model)
     finally:

From fe4a06b29f92f35b680d70abe4f4559c09464ab8 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 16:35:54 -0700
Subject: [PATCH 33/46] feat: Add LocalBackend wall time and GPU cost metrics

---
 docs/metrics-taxonomy.md            |  8 ++-
 src/art/local/backend.py            | 72 ++++++++++++++++++++++-
 src/art/model.py                    | 73 +++++++++++++++++++++--
 tests/unit/test_frontend_logging.py | 89 +++++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 8 deletions(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index bc4f7980..bb9ee871 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -55,16 +55,22 @@ ART now emits the following metrics from library internals where the data is ava
 - `loss/*` from trainer backends
 - `time/wall_clock_sec` and `training_step` on every logged row
 - `time/step_trainer_s` for training calls
-- `time/step_wall_s`, `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer`
+- `time/step_wall_s` from `PipelineTrainer` and `LocalBackend` train-step logs
+- `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer`
 - `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`
 - `data/step_num_groups_trainable` for train splits
 - `data/cum/num_unique_scenarios` when scenario IDs are present in group or trajectory metadata
 - `data/step_trainer_tokens` where the backend knows the trainer token count
+- `costs/gpu` on `LocalBackend` train-step logs when ART can resolve GPU pricing
 - `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s`
 - `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available
 
 Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer.
 
+For automatic GPU cost on `LocalBackend`, ART currently auto-detects H200s at
+$3/hour per GPU. For other GPU types, pass `gpu_cost_per_hour_usd=...` to
+`LocalBackend(...)` if you want ART to emit `costs/gpu` instead of skipping it.
+
 ## User Helpers
 
 Use the builder helpers for step-level metrics that only user code can know:
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index 1d839cf7..807a0f42 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -13,6 +13,10 @@
 
 logger = logging.getLogger(__name__)
 
+_AUTO_GPU_HOURLY_PRICING_USD = {
+    "H200": 3.0,
+}
+
 import aiohttp
 import numpy as np
 from openai import AsyncOpenAI
@@ -68,7 +72,13 @@
 
 
 class LocalBackend(Backend):
-    def __init__(self, *, in_process: bool = False, path: str | None = None) -> None:
+    def __init__(
+        self,
+        *,
+        in_process: bool = False,
+        path: str | None = None,
+        gpu_cost_per_hour_usd: float | None = None,
+    ) -> None:
         """
         Initializes a local, directory-based Backend interface at the given path.
 
@@ -79,9 +89,18 @@ def __init__(self, *, in_process: bool = False, path: str | None = None) -> None
         Args:
             in_process: Whether to run the local service in-process.
             path: The path to the local directory. Defaults to "{repo_root}/.art".
+            gpu_cost_per_hour_usd: Optional per-GPU hourly price override used for
+                automatic `costs/gpu` accounting on train steps. When unset,
+                ART auto-detects supported GPU types (H200 at $3/hr today) and
+                skips GPU cost logging for unknown devices instead of guessing.
         """
         self._in_process = in_process
         self._path = path or get_default_art_path()
+        self._gpu_cost_per_hour_usd = (
+            float(gpu_cost_per_hour_usd)
+            if gpu_cost_per_hour_usd is not None
+            else None
+        )
         os.makedirs(self._path, exist_ok=True)
 
         # Other initialization
@@ -89,6 +108,57 @@ def __init__(self, *, in_process: bool = False, path: str | None = None) -> None
         self._tokenizers: dict[str, PreTrainedTokenizerBase] = {}
         self._image_processors: dict[str, BaseImageProcessor | None] = {}
 
+    def supports_automatic_train_step_metrics(self) -> bool:
+        return True
+
+    def automatic_gpu_cost_per_hour_usd(self, model: Model) -> float | None:
+        per_gpu_cost = self._resolve_gpu_cost_per_hour_usd()
+        if per_gpu_cost is None:
+            return None
+
+        gpu_count = self._allocated_gpu_count(model)
+        if gpu_count <= 0:
+            return None
+        return per_gpu_cost * gpu_count
+
+    def _resolve_gpu_cost_per_hour_usd(self) -> float | None:
+        if self._gpu_cost_per_hour_usd is not None:
+            return self._gpu_cost_per_hour_usd
+        if not torch.cuda.is_available():
+            return None
+
+        num_visible_gpus = torch.cuda.device_count()
+        if num_visible_gpus <= 0:
+            return None
+
+        resolved_costs: list[float] = []
+        for index in range(num_visible_gpus):
+            device_name = torch.cuda.get_device_name(index).upper()
+            for gpu_name, hourly_cost in _AUTO_GPU_HOURLY_PRICING_USD.items():
+                if gpu_name in device_name:
+                    resolved_costs.append(hourly_cost)
+                    break
+            else:
+                return None
+
+        if not resolved_costs:
+            return None
+        if len(set(resolved_costs)) != 1:
+            return None
+        return resolved_costs[0]
+
+    def _allocated_gpu_count(self, model: Model) -> int:
+        if isinstance(model, TrainableModel) and model._internal_config is not None:
+            trainer_gpu_ids = set(model._internal_config.get("trainer_gpu_ids", []))
+            inference_gpu_ids = set(model._internal_config.get("inference_gpu_ids", []))
+            allocated_gpu_ids = trainer_gpu_ids | inference_gpu_ids
+            if allocated_gpu_ids:
+                return len(allocated_gpu_ids)
+
+        if not torch.cuda.is_available():
+            return 0
+        return torch.cuda.device_count()
+
     def __enter__(self) -> Self:
         return self
 
diff --git a/src/art/model.py b/src/art/model.py
index b844f399..62b4fcd8 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -118,6 +118,9 @@ class Model(
     _wandb_run: Optional["Run"] = None  # Private, for lazy wandb initialization
     _wandb_defined_metrics: set[str]
     _run_start_time: float
+    _run_start_monotonic: float
+    _last_local_train_log_monotonic: float
+    _last_local_train_step: int | None
     _metrics_builder: MetricsBuilder
     _metrics_builder_state_loaded: bool
     _cost_calculator: CostCalculator
@@ -151,6 +154,9 @@ def __init__(
         )
         object.__setattr__(self, "_wandb_defined_metrics", set())
         object.__setattr__(self, "_run_start_time", time.time())
+        object.__setattr__(self, "_run_start_monotonic", time.monotonic())
+        object.__setattr__(self, "_last_local_train_log_monotonic", self._run_start_monotonic)
+        object.__setattr__(self, "_last_local_train_step", None)
         object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train"))
         object.__setattr__(self, "_metrics_builder_state_loaded", False)
 
@@ -495,7 +501,7 @@ def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
             wandb.define_metric(key, step_metric="training_step")
             self._wandb_defined_metrics.add(key)
 
-    def _route_metrics_and_extract_non_costs(
+    def _route_metrics_and_collect_non_costs(
         self, metrics: dict[str, float], split: str
     ) -> dict[str, float]:
         non_cost_metrics: dict[str, float] = {}
@@ -523,6 +529,44 @@ def _route_metrics_and_extract_non_costs(
             non_cost_metrics[metric] = numeric_value
         return non_cost_metrics
 
+    def _collect_automatic_backend_metrics(
+        self,
+        *,
+        split: str,
+        step: int,
+        provided_metric_keys: set[str],
+    ) -> dict[str, float]:
+        if split != "train" or self._backend is None:
+            return {}
+
+        supports_step_metrics = getattr(
+            self._backend, "supports_automatic_train_step_metrics", None
+        )
+        if not callable(supports_step_metrics) or not supports_step_metrics():
+            return {}
+
+        if self._last_local_train_step == step:
+            return {}
+
+        now = time.monotonic()
+        step_wall_s = max(0.0, now - self._last_local_train_log_monotonic)
+        object.__setattr__(self, "_last_local_train_log_monotonic", now)
+        object.__setattr__(self, "_last_local_train_step", step)
+
+        automatic_metrics: dict[str, float] = {}
+        if "time/step_wall_s" not in provided_metric_keys:
+            automatic_metrics["time/step_wall_s"] = step_wall_s
+
+        gpu_cost_getter = getattr(self._backend, "automatic_gpu_cost_per_hour_usd", None)
+        if callable(gpu_cost_getter) and "costs/gpu" not in provided_metric_keys:
+            gpu_cost_per_hour_usd = gpu_cost_getter(self)
+            if gpu_cost_per_hour_usd is not None:
+                automatic_metrics["costs/gpu"] = (
+                    step_wall_s * float(gpu_cost_per_hour_usd) / 3600.0
+                )
+
+        return automatic_metrics
+
     def _add_default_step_metrics(
         self,
         trajectory_groups: list[TrajectoryGroup],
@@ -635,7 +679,15 @@ async def log(
         # If only metrics provided (no trajectories), just log them and return
         if trajectories is None:
             if metrics is not None:
-                metrics_without_costs = self._route_metrics_and_extract_non_costs(
+                provided_metric_keys = set(metrics)
+                automatic_metrics = self._collect_automatic_backend_metrics(
+                    split=split,
+                    step=step,
+                    provided_metric_keys=provided_metric_keys,
+                )
+                if automatic_metrics:
+                    self._route_metrics_and_collect_non_costs(automatic_metrics, split)
+                metrics_without_costs = self._route_metrics_and_collect_non_costs(
                     metrics, split
                 )
                 builder_metrics = await self._metrics_builder.flush()
@@ -646,11 +698,20 @@ async def log(
             return
 
         trajectory_groups = self._normalize_trajectory_groups(trajectories)
+        provided_metric_keys = set(metrics or {})
+
+        automatic_metrics = self._collect_automatic_backend_metrics(
+            split=split,
+            step=step,
+            provided_metric_keys=provided_metric_keys,
+        )
+        if automatic_metrics:
+            self._route_metrics_and_collect_non_costs(automatic_metrics, split)
 
         default_train_metrics = self._add_default_step_metrics(
             trajectory_groups,
             split=split,
-            provided_metric_keys=set(metrics or {}),
+            provided_metric_keys=provided_metric_keys,
         )
 
         # Ensure output directories exist
@@ -679,7 +740,7 @@ async def log(
 
         for group in trajectory_groups:
             if group.metrics:
-                group_non_cost = self._route_metrics_and_extract_non_costs(
+                group_non_cost = self._route_metrics_and_collect_non_costs(
                     cast(dict[str, float], group.metrics), split
                 )
             else:
@@ -704,7 +765,7 @@ async def log(
                         routed_metric = f"reward/{routed_metric}"
                     trajectory_metrics[routed_metric] = float(value)
 
-                non_cost_trajectory_metrics = self._route_metrics_and_extract_non_costs(
+                non_cost_trajectory_metrics = self._route_metrics_and_collect_non_costs(
                     trajectory_metrics,
                     split,
                 )
@@ -738,7 +799,7 @@ async def log(
 
         # Merge in any additional metrics passed directly
         if metrics is not None:
-            metrics_without_costs = self._route_metrics_and_extract_non_costs(
+            metrics_without_costs = self._route_metrics_and_collect_non_costs(
                 metrics, split
             )
             averages.update(metrics_without_costs)
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 755d7e64..c0adcb40 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -898,6 +898,95 @@ def test_should_log_wandb_logic_empty_list(self, tmp_path: Path):
             assert should_log is False
 
 
+class TestLocalBackendAutomaticMetrics:
+    @pytest.mark.asyncio
+    async def test_train_logs_automatic_wall_time_and_gpu_cost(
+        self, tmp_path: Path
+    ) -> None:
+        backend = LocalBackend(gpu_cost_per_hour_usd=3.0)
+
+        with patch("art.model.time.monotonic", side_effect=[100.0, 106.0, 111.0]):
+            model = TrainableModel(
+                name="test-model",
+                project="test-project",
+                base_model="Qwen/Qwen3-4B-Instruct-2507",
+                base_path=str(tmp_path),
+                report_metrics=[],
+                _internal_config={"trainer_gpu_ids": [0]},
+            )
+            model._backend = backend
+
+            await model.log(
+                trajectories=None,
+                split="train",
+                step=1,
+                metrics={"loss/train": 1.0},
+            )
+            await model.log(
+                trajectories=None,
+                split="train",
+                step=2,
+                metrics={"loss/train": 0.5},
+            )
+
+        history_path = tmp_path / "test-project/models/test-model/history.jsonl"
+        rows = [json.loads(line) for line in history_path.open() if line.strip()]
+
+        first_gpu_cost = 6.0 * 3.0 / 3600.0
+        second_gpu_cost = 5.0 * 3.0 / 3600.0
+
+        assert rows[0]["time/step_wall_s"] == pytest.approx(6.0)
+        assert rows[0]["costs/gpu"] == pytest.approx(first_gpu_cost)
+        assert rows[0]["costs/all"] == pytest.approx(first_gpu_cost)
+        assert rows[0]["costs/cum/gpu"] == pytest.approx(first_gpu_cost)
+
+        assert rows[1]["time/step_wall_s"] == pytest.approx(5.0)
+        assert rows[1]["costs/gpu"] == pytest.approx(second_gpu_cost)
+        assert rows[1]["costs/cum/gpu"] == pytest.approx(
+            first_gpu_cost + second_gpu_cost
+        )
+        assert rows[1]["costs/cum/all"] == pytest.approx(
+            first_gpu_cost + second_gpu_cost
+        )
+
+    @pytest.mark.asyncio
+    async def test_unknown_local_gpu_skips_cost_but_keeps_wall_time(
+        self, tmp_path: Path
+    ) -> None:
+        backend = LocalBackend()
+
+        with patch("art.model.time.monotonic", side_effect=[50.0, 55.0]):
+            with patch("art.local.backend.torch.cuda.is_available", return_value=True):
+                with patch("art.local.backend.torch.cuda.device_count", return_value=1):
+                    with patch(
+                        "art.local.backend.torch.cuda.get_device_name",
+                        return_value="NVIDIA A100-SXM4-80GB",
+                    ):
+                        model = TrainableModel(
+                            name="test-model",
+                            project="test-project",
+                            base_model="Qwen/Qwen3-4B-Instruct-2507",
+                            base_path=str(tmp_path),
+                            report_metrics=[],
+                            _internal_config={"trainer_gpu_ids": [0]},
+                        )
+                        model._backend = backend
+                        await model.log(
+                            trajectories=None,
+                            split="train",
+                            step=1,
+                            metrics={"loss/train": 1.0},
+                        )
+
+        history_path = tmp_path / "test-project/models/test-model/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["time/step_wall_s"] == pytest.approx(5.0)
+        assert "costs/gpu" not in entry
+        assert "costs/all" not in entry
+
+
 class TestModelAttributes:
     """Test new Model attributes."""
 

From 096c04235beeb5c128f4b8a0fdee8af2398986c6 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Mon, 9 Mar 2026 16:35:59 -0700
Subject: [PATCH 34/46] refactor: Rely on LocalBackend metrics in demo

---
 dev/yes-no-maybe-metrics.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py
index 036e3847..cbb0c5fd 100644
--- a/dev/yes-no-maybe-metrics.py
+++ b/dev/yes-no-maybe-metrics.py
@@ -2,7 +2,8 @@
 
 This keeps the same prompt family, rollout structure, and reward ordering as
 `dev/yes-no-maybe.py` while adding explicit metrics taxonomy instrumentation for
-actor/eval timing and data metrics.
+actor/eval timing and data metrics, while relying on LocalBackend for automatic
+step wall time and GPU cost logging.
 """
 
 from __future__ import annotations
@@ -217,7 +218,6 @@ async def main() -> None:
                 await model.log(val_groups, split="val", step=current_step)
 
             train_builder = model.metrics_builder("train")
-            step_started = time.monotonic()
             with train_builder.activate_context():
                 with train_builder.measure("time/step_actor_s"):
                     train_groups = await art.gather_trajectory_groups(
@@ -248,10 +248,7 @@ async def main() -> None:
                 split="train",
                 step=result.step,
                 trajectories=train_groups,
-                metrics={
-                    **result.metrics,
-                    "time/step_wall_s": time.monotonic() - step_started,
-                },
+                metrics=result.metrics,
             )
             print(f"step {result.step} complete")
 

From 67ff7262588dcba82a2c53b8dfccdb8e03c082ad Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 10:34:23 -0700
Subject: [PATCH 35/46] fix: preserve out-of-order wandb metric logging

---
 src/art/model.py                  | 8 ++++----
 tests/unit/test_metric_routing.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/art/model.py b/src/art/model.py
index 62b4fcd8..b4d7bdef 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -485,10 +485,10 @@ def _log_metrics(
         if should_log_wandb:
             if run := self._get_wandb_run():
                 self._define_wandb_step_metrics(prefixed.keys())
-                # Keep W&B's internal step aligned with ART's training_step so
-                # multiple log calls for the same training step do not inflate
-                # the run's step count.
-                run.log(prefixed, step=step)
+                # Let W&B use its own monotonically increasing history step.
+                # ART's `training_step` remains the x-axis via define_metric,
+                # which preserves out-of-order eval logging.
+                run.log(prefixed)
 
     def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
         import wandb
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
index 4d6316d0..f9904527 100644
--- a/tests/unit/test_metric_routing.py
+++ b/tests/unit/test_metric_routing.py
@@ -118,4 +118,4 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step(
         assert logged_metrics["costs/cum/train/prefill"] == 0.2
         assert logged_metrics["training_step"] == 1
         assert "time/wall_clock_sec" in logged_metrics
-        assert fake_run.log.call_args.kwargs == {"step": 1}
+        assert fake_run.log.call_args.kwargs == {}

From fab790762ed5618d191eccd800b91e2e1e3aeb59 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 10:34:45 -0700
Subject: [PATCH 36/46] fix: account for cached API token pricing

---
 src/art/metrics.py                |  24 ++
 src/art/metrics_api_cost.py       | 365 ++++++++++++++++++++++++++----
 tests/unit/test_track_api_cost.py | 188 ++++++++++++++-
 3 files changed, 527 insertions(+), 50 deletions(-)

diff --git a/src/art/metrics.py b/src/art/metrics.py
index 29946316..d2ba6358 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -99,6 +99,9 @@ def add_response_cost(
         model_name_getter: "ModelNameGetter | None" = None,
         prompt_price_per_million: float | None = None,
         completion_price_per_million: float | None = None,
+        cached_prompt_price_per_million: float | None = None,
+        cache_creation_price_per_million: float | None = None,
+        cache_read_price_per_million: float | None = None,
     ) -> float | None:
         normalized_source = source.strip("/")
         if not normalized_source:
@@ -111,6 +114,9 @@ def add_response_cost(
             model_name_getter=model_name_getter,
             prompt_price_per_million=prompt_price_per_million,
             completion_price_per_million=completion_price_per_million,
+            cached_prompt_price_per_million=cached_prompt_price_per_million,
+            cache_creation_price_per_million=cache_creation_price_per_million,
+            cache_read_price_per_million=cache_read_price_per_million,
             cost_extractors=self._shared_state.cost_extractors,
             model_pricing=self._shared_state.model_pricing,
         )
@@ -248,6 +254,9 @@ def register_model_pricing(
         *,
         prompt_per_million: float,
         completion_per_million: float,
+        cached_prompt_per_million: float | None = None,
+        cache_creation_per_million: float | None = None,
+        cache_read_per_million: float | None = None,
     ) -> None:
         normalized_model_name = model_name.strip()
         if not normalized_model_name:
@@ -255,6 +264,21 @@ def register_model_pricing(
         self._shared_state.model_pricing[normalized_model_name] = TokenPricing(
             prompt_per_million=float(prompt_per_million),
             completion_per_million=float(completion_per_million),
+            cached_prompt_per_million=(
+                float(cached_prompt_per_million)
+                if cached_prompt_per_million is not None
+                else None
+            ),
+            cache_creation_per_million=(
+                float(cache_creation_per_million)
+                if cache_creation_per_million is not None
+                else None
+            ),
+            cache_read_per_million=(
+                float(cache_read_per_million)
+                if cache_read_per_million is not None
+                else None
+            ),
         )
 
     def state_dict(self) -> dict[str, Any]:
diff --git a/src/art/metrics_api_cost.py b/src/art/metrics_api_cost.py
index 6d713192..a98332f8 100644
--- a/src/art/metrics_api_cost.py
+++ b/src/art/metrics_api_cost.py
@@ -4,6 +4,7 @@
 from dataclasses import dataclass
 from functools import wraps
 from inspect import iscoroutinefunction
+import re
 from typing import Any, ParamSpec, TypeVar
 
 from .costs import get_model_pricing, tokens_to_cost
@@ -23,6 +24,53 @@
 class TokenPricing:
     prompt_per_million: float
     completion_per_million: float
+    cached_prompt_per_million: float | None = None
+    cache_creation_per_million: float | None = None
+    cache_read_per_million: float | None = None
+
+
+@dataclass(frozen=True)
+class _OpenAITokenUsage:
+    prompt_tokens: float
+    completion_tokens: float
+    cached_prompt_tokens: float
+
+
+@dataclass(frozen=True)
+class _AnthropicTokenUsage:
+    input_tokens: float
+    output_tokens: float
+    cache_creation_input_tokens: float
+    cache_read_input_tokens: float
+
+
+_DEFAULT_TOKEN_PRICING: dict[str, TokenPricing] = {
+    "openai/gpt-4.1": TokenPricing(
+        prompt_per_million=2.0,
+        completion_per_million=8.0,
+        cached_prompt_per_million=0.5,
+    ),
+    "anthropic/claude-sonnet-4-6": TokenPricing(
+        prompt_per_million=3.0,
+        completion_per_million=15.0,
+        cache_creation_per_million=3.75,
+        cache_read_per_million=0.30,
+    ),
+}
+
+
+def _default_token_pricing(model_name: str) -> TokenPricing | None:
+    explicit = _DEFAULT_TOKEN_PRICING.get(model_name)
+    if explicit is not None:
+        return explicit
+
+    pricing = get_model_pricing(model_name)
+    if pricing is None:
+        return None
+    return TokenPricing(
+        prompt_per_million=pricing.prefill,
+        completion_per_million=pricing.sample,
+    )
 
 def normalize_provider(provider: str | None) -> str | None:
     if provider is None:
@@ -45,6 +93,20 @@ def _read_usage_field(usage: Any, field: str) -> float | None:
     return float(value)
 
 
+def _read_usage_nested_field(usage: Any, *fields: str) -> float | None:
+    current = usage
+    for field in fields:
+        if current is None:
+            return None
+        if isinstance(current, dict):
+            current = current.get(field)
+        else:
+            current = getattr(current, field, None)
+    if current is None:
+        return None
+    return float(current)
+
+
 def _response_usage(response: Any) -> Any:
     if isinstance(response, dict):
         return response.get("usage")
@@ -62,22 +124,50 @@ def _response_model_name(response: Any) -> str | None:
     return normalized or None
 
 
-def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None:
+def _extract_openai_token_counts(response: Any) -> _OpenAITokenUsage | None:
     usage = _response_usage(response)
     prompt_tokens = _read_usage_field(usage, "prompt_tokens")
     completion_tokens = _read_usage_field(usage, "completion_tokens")
-    if prompt_tokens is None and completion_tokens is None:
+    cached_prompt_tokens = (
+        _read_usage_nested_field(usage, "prompt_tokens_details", "cached_tokens") or 0.0
+    )
+    if (
+        prompt_tokens is None
+        and completion_tokens is None
+        and cached_prompt_tokens == 0.0
+    ):
         return None
-    return prompt_tokens or 0.0, completion_tokens or 0.0
+    total_prompt_tokens = prompt_tokens or 0.0
+    return _OpenAITokenUsage(
+        prompt_tokens=total_prompt_tokens,
+        completion_tokens=completion_tokens or 0.0,
+        cached_prompt_tokens=min(cached_prompt_tokens, total_prompt_tokens),
+    )
 
 
-def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None:
+def _extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | None:
     usage = _response_usage(response)
     input_tokens = _read_usage_field(usage, "input_tokens")
     output_tokens = _read_usage_field(usage, "output_tokens")
-    if input_tokens is None and output_tokens is None:
+    cache_creation_input_tokens = (
+        _read_usage_field(usage, "cache_creation_input_tokens") or 0.0
+    )
+    cache_read_input_tokens = (
+        _read_usage_field(usage, "cache_read_input_tokens") or 0.0
+    )
+    if (
+        input_tokens is None
+        and output_tokens is None
+        and cache_creation_input_tokens == 0.0
+        and cache_read_input_tokens == 0.0
+    ):
         return None
-    return input_tokens or 0.0, output_tokens or 0.0
+    return _AnthropicTokenUsage(
+        input_tokens=input_tokens or 0.0,
+        output_tokens=output_tokens or 0.0,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
+    )
 
 
 def _detect_provider(response: Any) -> str | None:
@@ -98,16 +188,145 @@ def _detect_provider(response: Any) -> str | None:
     return None
 
 
-def _estimate_cost(
-    token_counts: tuple[float, float] | None,
+def _estimate_openai_cost(
+    token_counts: _OpenAITokenUsage | None,
     pricing: TokenPricing,
 ) -> float | None:
     if token_counts is None:
         return None
-    prompt_tokens, completion_tokens = token_counts
-    return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost(
-        completion_tokens,
-        pricing.completion_per_million,
+    uncached_prompt_tokens = max(
+        token_counts.prompt_tokens - token_counts.cached_prompt_tokens,
+        0.0,
+    )
+    cached_prompt_price = (
+        pricing.cached_prompt_per_million
+        if pricing.cached_prompt_per_million is not None
+        else pricing.prompt_per_million
+    )
+    return (
+        tokens_to_cost(uncached_prompt_tokens, pricing.prompt_per_million)
+        + tokens_to_cost(
+            token_counts.cached_prompt_tokens,
+            cached_prompt_price,
+        )
+        + tokens_to_cost(
+            token_counts.completion_tokens,
+            pricing.completion_per_million,
+        )
+    )
+
+
+def _estimate_anthropic_cost(
+    token_counts: _AnthropicTokenUsage | None,
+    pricing: TokenPricing,
+) -> float | None:
+    if token_counts is None:
+        return None
+    cache_creation_price = (
+        pricing.cache_creation_per_million
+        if pricing.cache_creation_per_million is not None
+        else pricing.prompt_per_million
+    )
+    cache_read_price = (
+        pricing.cache_read_per_million
+        if pricing.cache_read_per_million is not None
+        else pricing.prompt_per_million
+    )
+    return (
+        tokens_to_cost(token_counts.input_tokens, pricing.prompt_per_million)
+        + tokens_to_cost(
+            token_counts.cache_creation_input_tokens,
+            cache_creation_price,
+        )
+        + tokens_to_cost(
+            token_counts.cache_read_input_tokens,
+            cache_read_price,
+        )
+        + tokens_to_cost(
+            token_counts.output_tokens,
+            pricing.completion_per_million,
+        )
+    )
+
+
+def _strip_snapshot_suffix(model_name: str) -> str:
+    for pattern in (
+        r"^(.*)-\d{4}-\d{2}-\d{2}$",
+        r"^(.*)-\d{8}$",
+    ):
+        match = re.match(pattern, model_name)
+        if match is not None:
+            return match.group(1)
+    return model_name
+
+
+def _candidate_model_names(
+    normalized_model_name: str,
+    *,
+    provider: str | None,
+) -> list[str]:
+    candidates: list[str] = []
+
+    def _append(candidate: str | None) -> None:
+        if candidate and candidate not in candidates:
+            candidates.append(candidate)
+
+    _append(normalized_model_name)
+    _append(_strip_snapshot_suffix(normalized_model_name))
+
+    if provider is not None and "/" not in normalized_model_name:
+        _append(f"{provider}/{normalized_model_name}")
+        _append(f"{provider}/{_strip_snapshot_suffix(normalized_model_name)}")
+
+    return candidates
+
+
+def _resolve_registered_or_default_pricing(
+    model_name: str,
+    *,
+    model_pricing: Mapping[str, TokenPricing],
+) -> TokenPricing | None:
+    registered = model_pricing.get(model_name)
+    if registered is not None:
+        return registered
+    return _default_token_pricing(model_name)
+
+
+def _merge_token_pricing(
+    *,
+    base_pricing: TokenPricing,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
+) -> TokenPricing:
+    return TokenPricing(
+        prompt_per_million=(
+            float(prompt_price_per_million)
+            if prompt_price_per_million is not None
+            else base_pricing.prompt_per_million
+        ),
+        completion_per_million=(
+            float(completion_price_per_million)
+            if completion_price_per_million is not None
+            else base_pricing.completion_per_million
+        ),
+        cached_prompt_per_million=(
+            float(cached_prompt_price_per_million)
+            if cached_prompt_price_per_million is not None
+            else base_pricing.cached_prompt_per_million
+        ),
+        cache_creation_per_million=(
+            float(cache_creation_price_per_million)
+            if cache_creation_price_per_million is not None
+            else base_pricing.cache_creation_per_million
+        ),
+        cache_read_per_million=(
+            float(cache_read_price_per_million)
+            if cache_read_price_per_million is not None
+            else base_pricing.cache_read_per_million
+        ),
     )
 
 
@@ -117,6 +336,7 @@ def _resolve_model_name(
     provider: str | None,
     model_name: str | None,
     model_name_getter: ModelNameGetter | None,
+    model_pricing: Mapping[str, TokenPricing],
 ) -> str | None:
     explicit_model_name = model_name.strip() if model_name is not None else None
     if explicit_model_name:
@@ -134,11 +354,19 @@ def _resolve_model_name(
         return None
 
     normalized_provider = normalize_provider(provider)
-    if normalized_provider is not None and "/" not in normalized_model_name:
-        provider_scoped_name = f"{normalized_provider}/{normalized_model_name}"
-        if get_model_pricing(provider_scoped_name) is not None:
-            return provider_scoped_name
+    candidates = _candidate_model_names(
+        normalized_model_name,
+        provider=normalized_provider,
+    )
+    for candidate in candidates:
+        if _resolve_registered_or_default_pricing(
+            candidate,
+            model_pricing=model_pricing,
+        ) is not None:
+            return candidate
 
+    if normalized_provider is not None and "/" not in normalized_model_name:
+        return f"{normalized_provider}/{normalized_model_name}"
     return normalized_model_name
 
 
@@ -150,6 +378,9 @@ def _resolve_token_pricing(
     model_name_getter: ModelNameGetter | None,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
     model_pricing: Mapping[str, TokenPricing],
 ) -> TokenPricing:
     explicit_prompt_price = (
@@ -162,47 +393,61 @@ def _resolve_token_pricing(
         if completion_price_per_million is not None
         else None
     )
-    if (
-        explicit_prompt_price is not None
-        and explicit_completion_price is not None
-    ):
-        return TokenPricing(
-            prompt_per_million=explicit_prompt_price,
-            completion_per_million=explicit_completion_price,
-        )
+    explicit_cached_prompt_price = (
+        float(cached_prompt_price_per_million)
+        if cached_prompt_price_per_million is not None
+        else None
+    )
+    explicit_cache_creation_price = (
+        float(cache_creation_price_per_million)
+        if cache_creation_price_per_million is not None
+        else None
+    )
+    explicit_cache_read_price = (
+        float(cache_read_price_per_million)
+        if cache_read_price_per_million is not None
+        else None
+    )
 
     resolved_model_name = _resolve_model_name(
         response,
         provider=provider,
         model_name=model_name,
         model_name_getter=model_name_getter,
+        model_pricing=model_pricing,
     )
     if resolved_model_name is None:
+        if explicit_prompt_price is not None and explicit_completion_price is not None:
+            return TokenPricing(
+                prompt_per_million=explicit_prompt_price,
+                completion_per_million=explicit_completion_price,
+                cached_prompt_per_million=explicit_cached_prompt_price,
+                cache_creation_per_million=explicit_cache_creation_price,
+                cache_read_per_million=explicit_cache_read_price,
+            )
         raise ValueError(
             "API cost tracking requires model-aware pricing. "
             "Provide both explicit token prices or supply a model_name "
             "(or response.model / model_name_getter) with configured pricing."
         )
 
-    configured_pricing = model_pricing.get(resolved_model_name)
+    configured_pricing = _resolve_registered_or_default_pricing(
+        resolved_model_name,
+        model_pricing=model_pricing,
+    )
     if configured_pricing is None:
-        pricing = get_model_pricing(resolved_model_name, strict=True)
-        configured_pricing = TokenPricing(
-            prompt_per_million=pricing.prefill,
-            completion_per_million=pricing.sample,
+        raise ValueError(
+            f"No pricing configured for model '{resolved_model_name}'. "
+            "Provide explicit token prices or register model pricing."
         )
 
-    return TokenPricing(
-        prompt_per_million=(
-            explicit_prompt_price
-            if explicit_prompt_price is not None
-            else configured_pricing.prompt_per_million
-        ),
-        completion_per_million=(
-            explicit_completion_price
-            if explicit_completion_price is not None
-            else configured_pricing.completion_per_million
-        ),
+    return _merge_token_pricing(
+        base_pricing=configured_pricing,
+        prompt_price_per_million=explicit_prompt_price,
+        completion_price_per_million=explicit_completion_price,
+        cached_prompt_price_per_million=explicit_cached_prompt_price,
+        cache_creation_price_per_million=explicit_cache_creation_price,
+        cache_read_price_per_million=explicit_cache_read_price,
     )
 
 
@@ -214,6 +459,9 @@ def extract_api_cost(
     model_name_getter: ModelNameGetter | None,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
     cost_extractors: Mapping[str, CostExtractor],
     model_pricing: Mapping[str, TokenPricing],
 ) -> float | None:
@@ -232,12 +480,18 @@ def extract_api_cost(
             model_name_getter=model_name_getter,
             prompt_price_per_million=prompt_price_per_million,
             completion_price_per_million=completion_price_per_million,
+            cached_prompt_price_per_million=cached_prompt_price_per_million,
+            cache_creation_price_per_million=cache_creation_price_per_million,
+            cache_read_price_per_million=cache_read_price_per_million,
             model_pricing=model_pricing,
         )
         if provider_name == OPENAI_PROVIDER:
-            return _estimate_cost(_extract_openai_token_counts(response), pricing)
+            return _estimate_openai_cost(_extract_openai_token_counts(response), pricing)
         if provider_name == ANTHROPIC_PROVIDER:
-            return _estimate_cost(_extract_anthropic_token_counts(response), pricing)
+            return _estimate_anthropic_cost(
+                _extract_anthropic_token_counts(response),
+                pricing,
+            )
 
     pricing = _resolve_token_pricing(
         response,
@@ -246,12 +500,16 @@ def extract_api_cost(
         model_name_getter=model_name_getter,
         prompt_price_per_million=prompt_price_per_million,
         completion_price_per_million=completion_price_per_million,
+        cached_prompt_price_per_million=cached_prompt_price_per_million,
+        cache_creation_price_per_million=cache_creation_price_per_million,
+        cache_read_price_per_million=cache_read_price_per_million,
         model_pricing=model_pricing,
     )
-    token_counts = _extract_openai_token_counts(response)
-    if token_counts is None:
-        token_counts = _extract_anthropic_token_counts(response)
-    return _estimate_cost(token_counts, pricing)
+    openai_token_counts = _extract_openai_token_counts(response)
+    if openai_token_counts is not None:
+        return _estimate_openai_cost(openai_token_counts, pricing)
+    anthropic_token_counts = _extract_anthropic_token_counts(response)
+    return _estimate_anthropic_cost(anthropic_token_counts, pricing)
 
 
 def _record_api_cost(
@@ -264,6 +522,9 @@ def _record_api_cost(
     model_name_getter: ModelNameGetter | None,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
 ) -> None:
     try:
         from .metrics import MetricsBuilder
@@ -281,6 +542,9 @@ def _record_api_cost(
         model_name_getter=model_name_getter,
         prompt_price_per_million=prompt_price_per_million,
         completion_price_per_million=completion_price_per_million,
+        cached_prompt_price_per_million=cached_prompt_price_per_million,
+        cache_creation_price_per_million=cache_creation_price_per_million,
+        cache_read_price_per_million=cache_read_price_per_million,
     )
 
 
@@ -293,6 +557,9 @@ def track_api_cost(
     response_getter: ResponseGetter | None = None,
     prompt_price_per_million: float | None = None,
     completion_price_per_million: float | None = None,
+    cached_prompt_price_per_million: float | None = None,
+    cache_creation_price_per_million: float | None = None,
+    cache_read_price_per_million: float | None = None,
 ) -> Callable[[Callable[P, R]], Callable[P, R]]:
     normalized_source = source.strip("/")
     if not normalized_source:
@@ -315,6 +582,9 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
                     model_name_getter=model_name_getter,
                     prompt_price_per_million=prompt_price_per_million,
                     completion_price_per_million=completion_price_per_million,
+                    cached_prompt_price_per_million=cached_prompt_price_per_million,
+                    cache_creation_price_per_million=cache_creation_price_per_million,
+                    cache_read_price_per_million=cache_read_price_per_million,
                 )
                 return result
 
@@ -332,6 +602,9 @@ def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
                 model_name_getter=model_name_getter,
                 prompt_price_per_million=prompt_price_per_million,
                 completion_price_per_million=completion_price_per_million,
+                cached_prompt_price_per_million=cached_prompt_price_per_million,
+                cache_creation_price_per_million=cache_creation_price_per_million,
+                cache_read_price_per_million=cache_read_price_per_million,
             )
             return result
 
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index 5162595f..c5f951cc 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -6,14 +6,26 @@
 import pytest
 
 from art import Model, TrainableModel, Trajectory, TrajectoryGroup
+from art.costs import compute_sample_costs, get_model_pricing
 from art.metrics import MetricsBuilder, track_api_cost
 from art.pipeline_trainer.trainer import PipelineTrainer
 
 
 class _OpenAIUsage:
-    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
+    def __init__(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+        *,
+        cached_tokens: int = 0,
+    ) -> None:
         self.prompt_tokens = prompt_tokens
         self.completion_tokens = completion_tokens
+        self.prompt_tokens_details = type(
+            "PromptTokensDetails",
+            (),
+            {"cached_tokens": cached_tokens},
+        )()
 
 
 class _OpenAIResponse:
@@ -22,16 +34,30 @@ def __init__(
         prompt_tokens: int,
         completion_tokens: int,
         *,
+        cached_tokens: int = 0,
         model: str | None = None,
     ) -> None:
-        self.usage = _OpenAIUsage(prompt_tokens, completion_tokens)
+        self.usage = _OpenAIUsage(
+            prompt_tokens,
+            completion_tokens,
+            cached_tokens=cached_tokens,
+        )
         self.model = model
 
 
 class _AnthropicUsage:
-    def __init__(self, input_tokens: int, output_tokens: int) -> None:
+    def __init__(
+        self,
+        input_tokens: int,
+        output_tokens: int,
+        *,
+        cache_creation_input_tokens: int = 0,
+        cache_read_input_tokens: int = 0,
+    ) -> None:
         self.input_tokens = input_tokens
         self.output_tokens = output_tokens
+        self.cache_creation_input_tokens = cache_creation_input_tokens
+        self.cache_read_input_tokens = cache_read_input_tokens
 
 
 class _AnthropicResponse:
@@ -40,9 +66,16 @@ def __init__(
         input_tokens: int,
         output_tokens: int,
         *,
+        cache_creation_input_tokens: int = 0,
+        cache_read_input_tokens: int = 0,
         model: str | None = None,
     ) -> None:
-        self.usage = _AnthropicUsage(input_tokens, output_tokens)
+        self.usage = _AnthropicUsage(
+            input_tokens,
+            output_tokens,
+            cache_creation_input_tokens=cache_creation_input_tokens,
+            cache_read_input_tokens=cache_read_input_tokens,
+        )
         self.model = model
 
 
@@ -69,6 +102,33 @@ async def _judge() -> _OpenAIResponse:
         metrics = await builder.flush()
         assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
 
+    @pytest.mark.asyncio
+    async def test_openai_cost_extraction_accounts_for_cached_tokens(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/cached_openai",
+            provider="openai",
+            prompt_price_per_million=2.0,
+            completion_price_per_million=8.0,
+            cached_prompt_price_per_million=0.5,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=2_000,
+                completion_tokens=100,
+                cached_tokens=1_500,
+            )
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        assert metrics["costs/train/llm_judge/cached_openai"] == pytest.approx(0.00255)
+
     @pytest.mark.asyncio
     async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None:
         builder = MetricsBuilder(cost_context="train")
@@ -94,6 +154,126 @@ async def _judge() -> _AnthropicResponse:
         metrics = await builder.flush()
         assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062)
 
+    @pytest.mark.asyncio
+    async def test_anthropic_cost_extraction_accounts_for_cache_write_and_read(
+        self,
+    ) -> None:
+        builder = MetricsBuilder(cost_context="eval")
+        builder.register_model_pricing(
+            "anthropic/claude-sonnet-4-6",
+            prompt_per_million=3.0,
+            completion_per_million=15.0,
+            cache_creation_per_million=3.75,
+            cache_read_per_million=0.30,
+        )
+
+        @track_api_cost(
+            source="llm_judge/anthropic_cache",
+            provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
+        )
+        async def _judge() -> _AnthropicResponse:
+            return _AnthropicResponse(
+                input_tokens=100,
+                output_tokens=50,
+                cache_creation_input_tokens=1_000,
+                cache_read_input_tokens=500,
+            )
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        assert metrics["costs/eval/llm_judge/anthropic_cache"] == pytest.approx(0.00495)
+
+    @pytest.mark.asyncio
+    async def test_response_model_name_resolves_provider_scoped_global_pricing(
+        self,
+    ) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        pricing = get_model_pricing("openai/gpt-oss-20b")
+        assert pricing is not None
+
+        @track_api_cost(source="llm_judge/global_pricing", provider="openai")
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=1_000,
+                completion_tokens=2_000,
+                model="gpt-oss-20b",
+            )
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        expected = compute_sample_costs(
+            prompt_tokens=1_000,
+            completion_tokens=2_000,
+            pricing=pricing,
+        )
+        assert metrics["costs/train/llm_judge/global_pricing"] == pytest.approx(
+            expected["costs_prefill"] + expected["costs_sample"]
+        )
+
+    @pytest.mark.asyncio
+    async def test_response_model_name_resolves_provider_scoped_registered_pricing(
+        self,
+    ) -> None:
+        builder = MetricsBuilder(cost_context="eval")
+        builder.register_model_pricing(
+            "anthropic/test-judge",
+            prompt_per_million=1.5,
+            completion_per_million=2.5,
+        )
+
+        @track_api_cost(source="llm_judge/provider_resolution", provider="anthropic")
+        async def _judge() -> _AnthropicResponse:
+            return _AnthropicResponse(
+                input_tokens=400,
+                output_tokens=600,
+                model="test-judge",
+            )
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        assert metrics["costs/eval/llm_judge/provider_resolution"] == pytest.approx(
+            0.0021
+        )
+
+    @pytest.mark.asyncio
+    async def test_snapshot_model_name_resolves_to_global_pricing(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(source="llm_judge/snapshot", provider="openai")
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=1_000,
+                completion_tokens=100,
+                cached_tokens=800,
+                model="gpt-4.1-2025-04-14",
+            )
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        expected = ((200 * 2.0) + (800 * 0.5) + (100 * 8.0)) / 1_000_000
+        assert metrics["costs/train/llm_judge/snapshot"] == pytest.approx(expected)
+
     @pytest.mark.asyncio
     async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None:
         builder = MetricsBuilder(cost_context="train")

From 9a48f2e39242b10ddfd60d6fd1b8f69c6832424d Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 10:34:58 -0700
Subject: [PATCH 37/46] test: Add live API cost smoke tests

---
 pyproject.toml                          |   3 +
 tests/integration/test_live_api_cost.py | 224 ++++++++++++++++++++++++
 2 files changed, 227 insertions(+)
 create mode 100644 tests/integration/test_live_api_cost.py

diff --git a/pyproject.toml b/pyproject.toml
index af96ff76..2469677c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,6 +116,9 @@ force-sort-within-sections = true
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
+markers = [
+    "live_api_cost: opt-in live API cost validation against provider endpoints",
+]
 
 [tool.uv]
 required-version = ">=0.6.15"
diff --git a/tests/integration/test_live_api_cost.py b/tests/integration/test_live_api_cost.py
new file mode 100644
index 00000000..c2bd733c
--- /dev/null
+++ b/tests/integration/test_live_api_cost.py
@@ -0,0 +1,224 @@
+import json
+import os
+from pathlib import Path
+import urllib.request
+from uuid import uuid4
+
+import pytest
+
+from art import Model
+from art.metrics import track_api_cost
+
+pytestmark = pytest.mark.live_api_cost
+
+_LIVE_ENV = "ART_RUN_LIVE_API_COST_TESTS"
+
+
+def _require_live_test_env(*required_vars: str) -> None:
+    if os.environ.get(_LIVE_ENV) != "1":
+        pytest.skip(f"Set {_LIVE_ENV}=1 to run live API cost tests.")
+    missing = [name for name in required_vars if not os.environ.get(name)]
+    if missing:
+        pytest.skip(f"Missing required env vars: {', '.join(missing)}")
+
+
+def _post_json(url: str, *, headers: dict[str, str], payload: dict) -> dict:
+    request = urllib.request.Request(
+        url,
+        data=json.dumps(payload).encode("utf-8"),
+        headers=headers,
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=120) as response:
+        return json.loads(response.read().decode("utf-8"))
+
+
+def _cacheable_prefix(word_count: int = 1500) -> str:
+    return " ".join(f"cache-token-{index % 16}" for index in range(word_count))
+
+
+def _history_rows(history_path: Path) -> list[dict]:
+    return [json.loads(line) for line in history_path.read_text().splitlines() if line]
+
+
+def _openai_completion(*, api_key: str, prompt_cache_key: str, prefix: str) -> dict:
+    return _post_json(
+        "https://api.openai.com/v1/chat/completions",
+        headers={
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        },
+        payload={
+            "model": "gpt-4.1",
+            "messages": [
+                {"role": "system", "content": prefix},
+                {"role": "user", "content": "Reply with OK."},
+            ],
+            "temperature": 0,
+            "max_completion_tokens": 4,
+            "prompt_cache_key": prompt_cache_key,
+        },
+    )
+
+
+def _anthropic_message(*, api_key: str, prefix: str) -> dict:
+    return _post_json(
+        "https://api.anthropic.com/v1/messages",
+        headers={
+            "x-api-key": api_key,
+            "anthropic-version": "2023-06-01",
+            "content-type": "application/json",
+        },
+        payload={
+            "model": "claude-sonnet-4-6",
+            "max_tokens": 8,
+            "temperature": 0,
+            "system": [
+                {
+                    "type": "text",
+                    "text": prefix,
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ],
+            "messages": [
+                {"role": "user", "content": "Reply with OK."},
+            ],
+        },
+    )
+
+
+class TestLiveApiCost:
+    @pytest.mark.asyncio
+    async def test_openai_gpt_4_1_cached_prompt_cost(self, tmp_path: Path) -> None:
+        _require_live_test_env("OPENAI_API_KEY")
+
+        api_key = os.environ["OPENAI_API_KEY"]
+        prefix = _cacheable_prefix()
+        prompt_cache_key = f"art-live-api-cost-{uuid4()}"
+
+        # Warm the cache first so the tracked request can validate cached pricing.
+        _openai_completion(
+            api_key=api_key,
+            prompt_cache_key=prompt_cache_key,
+            prefix=prefix,
+        )
+
+        model = Model(
+            name="live-openai-api-cost",
+            project="live-api-cost",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/openai_cached_prompt",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
+        def _judge() -> dict:
+            return _openai_completion(
+                api_key=api_key,
+                prompt_cache_key=prompt_cache_key,
+                prefix=prefix,
+            )
+
+        token = model.activate_metrics_context("eval")
+        try:
+            response = _judge()
+        finally:
+            token.var.reset(token)
+
+        await model.log(trajectories=None, split="val", step=1, metrics={})
+
+        usage = response["usage"]
+        cached_tokens = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
+        assert cached_tokens > 0
+
+        expected_cost = (
+            ((usage["prompt_tokens"] - cached_tokens) * 2.0)
+            + (cached_tokens * 0.5)
+            + (usage["completion_tokens"] * 8.0)
+        ) / 1_000_000
+
+        history_path = (
+            tmp_path
+            / "live-api-cost"
+            / "models"
+            / "live-openai-api-cost"
+            / "history.jsonl"
+        )
+        row = _history_rows(history_path)[0]
+        assert row["costs/eval/llm_judge/openai_cached_prompt"] == pytest.approx(
+            expected_cost
+        )
+
+    @pytest.mark.asyncio
+    async def test_anthropic_claude_sonnet_4_6_prompt_cache_cost(
+        self,
+        tmp_path: Path,
+    ) -> None:
+        _require_live_test_env("ANTHROPIC_API_KEY")
+
+        api_key = os.environ["ANTHROPIC_API_KEY"]
+        prefix = _cacheable_prefix()
+
+        model = Model(
+            name="live-anthropic-api-cost",
+            project="live-api-cost",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/anthropic_prompt_cache",
+            provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
+        )
+        def _judge() -> dict:
+            return _anthropic_message(api_key=api_key, prefix=prefix)
+
+        token = model.activate_metrics_context("eval")
+        try:
+            first_response = _judge()
+        finally:
+            token.var.reset(token)
+        await model.log(trajectories=None, split="val", step=1, metrics={})
+
+        token = model.activate_metrics_context("eval")
+        try:
+            second_response = _judge()
+        finally:
+            token.var.reset(token)
+        await model.log(trajectories=None, split="val", step=2, metrics={})
+
+        first_usage = first_response["usage"]
+        second_usage = second_response["usage"]
+        assert first_usage.get("cache_creation_input_tokens", 0) > 0
+        assert second_usage.get("cache_read_input_tokens", 0) > 0
+
+        first_expected_cost = (
+            (first_usage["input_tokens"] * 3.0)
+            + (first_usage.get("cache_creation_input_tokens", 0) * 3.75)
+            + (first_usage["output_tokens"] * 15.0)
+        ) / 1_000_000
+        second_expected_cost = (
+            (second_usage["input_tokens"] * 3.0)
+            + (second_usage.get("cache_read_input_tokens", 0) * 0.30)
+            + (second_usage["output_tokens"] * 15.0)
+        ) / 1_000_000
+
+        history_path = (
+            tmp_path
+            / "live-api-cost"
+            / "models"
+            / "live-anthropic-api-cost"
+            / "history.jsonl"
+        )
+        first_row, second_row = _history_rows(history_path)
+
+        assert first_row["costs/eval/llm_judge/anthropic_prompt_cache"] == pytest.approx(
+            first_expected_cost
+        )
+        assert second_row[
+            "costs/eval/llm_judge/anthropic_prompt_cache"
+        ] == pytest.approx(second_expected_cost)

From 84328cef802c633cd72d0fc61ce7797fccc96796 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 10:51:48 -0700
Subject: [PATCH 38/46] refactor: Rename API cost module

---
 src/art/{metrics_api_cost.py => api_costs.py} | 0
 src/art/metrics.py                            | 4 ++--
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename src/art/{metrics_api_cost.py => api_costs.py} (100%)

diff --git a/src/art/metrics_api_cost.py b/src/art/api_costs.py
similarity index 100%
rename from src/art/metrics_api_cost.py
rename to src/art/api_costs.py
diff --git a/src/art/metrics.py b/src/art/metrics.py
index d2ba6358..4a064fda 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -7,7 +7,7 @@
 import time
 from typing import Any
 
-from .metrics_api_cost import (
+from .api_costs import (
     CostExtractor,
     ModelNameGetter,
     TokenPricing,
@@ -403,4 +403,4 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
                 result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds
 
 
-from .metrics_api_cost import track_api_cost
+from .api_costs import track_api_cost

From 7c0a86f810df856e52206f877f139fde5d0aa53b Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 11:05:22 -0700
Subject: [PATCH 39/46] docs: Remove metrics taxonomy smoke example

---
 docs/metrics-taxonomy.md           |  20 +----
 examples/metrics_taxonomy_smoke.py | 120 -----------------------------
 2 files changed, 2 insertions(+), 138 deletions(-)
 delete mode 100644 examples/metrics_taxonomy_smoke.py

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index bb9ee871..9c7c7b54 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -91,16 +91,6 @@ builder.add_idle_times(step_actor_idle_s=result.actor_idle_s)
 
 If these metrics are logged before the next `model.log(...)` flush, ART will also emit the cumulative and derived throughput metrics automatically.
 
-## End-to-End Smoke Test
-
-Run:
-
-```bash
-uv run python examples/metrics_taxonomy_smoke.py
-```
-
-This writes a local history file and, if `WANDB_API_KEY` is set, logs the same metrics to W&B.
-
 ## API Cost Decorator (Phase 2/3)
 
 Use `@track_api_cost` to automatically write judge/API spend into `costs/{train|eval}/...`.
@@ -123,17 +113,11 @@ async def run_judge(client, messages):
 Activate metric cost context while running train/eval logic:
 
 ```python
-train_token = model.activate_metrics_context("train")
-try:
+with model.metrics_builder("train").activate_context():
     await run_judge(client, train_messages)
-finally:
-    train_token.var.reset(train_token)
 
-eval_token = model.activate_metrics_context("eval")
-try:
+with model.metrics_builder("eval").activate_context():
     await run_judge(client, eval_messages)
-finally:
-    eval_token.var.reset(eval_token)
 ```
 
 The next `model.log(...)` flush for that step will include:
diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py
deleted file mode 100644
index ff4d4afe..00000000
--- a/examples/metrics_taxonomy_smoke.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import asyncio
-import json
-import os
-from pathlib import Path
-import time
-
-import art
-from art.metrics import track_api_cost
-
-
-class _Usage:
-    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
-        self.prompt_tokens = prompt_tokens
-        self.completion_tokens = completion_tokens
-
-
-class _Response:
-    def __init__(self, prompt_tokens: int, completion_tokens: int) -> None:
-        self.usage = _Usage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-        )
-
-
-@track_api_cost(
-    source="llm_judge/decorator_demo",
-    provider="openai",
-    model_name="openai/gpt-oss-20b",
-)
-async def _mock_judge_call(step: int) -> _Response:
-    return _Response(
-        prompt_tokens=50 * step,
-        completion_tokens=20 * step,
-    )
-
-
-async def main() -> None:
-    project_spec = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke")
-    entity = os.environ.get("ART_METRICS_ENTITY")
-    project = project_spec
-    if entity is None and "/" in project_spec:
-        split_entity, split_project = project_spec.split("/", 1)
-        if split_entity and split_project:
-            entity = split_entity
-            project = split_project
-
-    model_name = os.environ.get(
-        "ART_METRICS_MODEL", f"metrics-smoke-{int(time.time())}"
-    )
-    base_path = os.environ.get("ART_METRICS_BASE_PATH", ".art")
-
-    model = art.Model(
-        name=model_name,
-        project=project,
-        entity=entity,
-        base_path=base_path,
-        report_metrics=["wandb"],
-    )
-
-    for step in (1, 2):
-        train_token = model.activate_metrics_context("train")
-        try:
-            await _mock_judge_call(step)
-        finally:
-            train_token.var.reset(train_token)
-
-        trajectories = [
-            art.TrajectoryGroup(
-                trajectories=[
-                    art.Trajectory(
-                        reward=0.4 + 0.1 * step,
-                        metrics={
-                            "judge_quality": 0.7 + 0.05 * step,
-                            "reward/custom_prefixed": 0.2 * step,
-                        },
-                        messages_and_choices=[
-                            {"role": "user", "content": f"smoke step {step}"},
-                            {"role": "assistant", "content": "ok"},
-                        ],
-                    )
-                ],
-                exceptions=[],
-            )
-        ]
-
-        await model.log(
-            trajectories,
-            split="train",
-            step=step,
-            metrics={
-                "loss/train": 1.0 / step,
-                "loss/grad_norm": 0.5 + 0.1 * step,
-                "throughput/train_tok_per_sec": 1000.0 + 100.0 * step,
-                "time/step_wall_s": 1.5 + 0.2 * step,
-                "data/step_num_scenarios": 2.0,
-                "data/step_actor_tokens": 120.0 + 10.0 * step,
-                "costs_prefill": 0.10 * step,
-                "costs_sample": 0.05 * step,
-                "costs/train/llm_judge/correctness": 0.02 * step,
-            },
-        )
-
-    history_path = Path(base_path) / project / "models" / model_name / "history.jsonl"
-    print(f"Wrote history: {history_path}")
-
-    with open(history_path) as f:
-        rows = [json.loads(line) for line in f]
-
-    print("\nLast row key excerpts:")
-    last = rows[-1]
-    show_prefixes = ("reward/", "loss/", "throughput/", "time/", "data/", "costs/")
-    for key in sorted(last):
-        if key.startswith(show_prefixes):
-            print(f"{key}: {last[key]}")
-
-    print("\nIf WANDB_API_KEY is set, metrics are also logged to W&B.")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())

From 57644a01861294fceefd776de872dfd820c51215 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 11:05:32 -0700
Subject: [PATCH 40/46] refactor: Simplify metrics cost helpers

---
 src/art/api_costs.py | 52 ++++++++++++++++++++++----------------------
 src/art/metrics.py   | 13 -----------
 2 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/src/art/api_costs.py b/src/art/api_costs.py
index a98332f8..957ab38b 100644
--- a/src/art/api_costs.py
+++ b/src/art/api_costs.py
@@ -249,6 +249,21 @@ def _estimate_anthropic_cost(
     )
 
 
+def _estimate_provider_cost(
+    provider_name: str | None,
+    response: Any,
+    pricing: TokenPricing,
+) -> float | None:
+    if provider_name == OPENAI_PROVIDER:
+        return _estimate_openai_cost(_extract_openai_token_counts(response), pricing)
+    if provider_name == ANTHROPIC_PROVIDER:
+        return _estimate_anthropic_cost(
+            _extract_anthropic_token_counts(response),
+            pricing,
+        )
+    return None
+
+
 def _strip_snapshot_suffix(model_name: str) -> str:
     for pattern in (
         r"^(.*)-\d{4}-\d{2}-\d{2}$",
@@ -466,32 +481,13 @@ def extract_api_cost(
     model_pricing: Mapping[str, TokenPricing],
 ) -> float | None:
     provider_name = normalize_provider(provider) or _detect_provider(response)
-    if provider_name is not None:
-        custom_extractor = cost_extractors.get(provider_name)
-        if custom_extractor is not None:
-            custom_cost = custom_extractor(response)
-            if custom_cost is not None:
-                return float(custom_cost)
-
-        pricing = _resolve_token_pricing(
-            response,
-            provider=provider_name,
-            model_name=model_name,
-            model_name_getter=model_name_getter,
-            prompt_price_per_million=prompt_price_per_million,
-            completion_price_per_million=completion_price_per_million,
-            cached_prompt_price_per_million=cached_prompt_price_per_million,
-            cache_creation_price_per_million=cache_creation_price_per_million,
-            cache_read_price_per_million=cache_read_price_per_million,
-            model_pricing=model_pricing,
-        )
-        if provider_name == OPENAI_PROVIDER:
-            return _estimate_openai_cost(_extract_openai_token_counts(response), pricing)
-        if provider_name == ANTHROPIC_PROVIDER:
-            return _estimate_anthropic_cost(
-                _extract_anthropic_token_counts(response),
-                pricing,
-            )
+    custom_extractor = (
+        cost_extractors.get(provider_name) if provider_name is not None else None
+    )
+    if custom_extractor is not None:
+        custom_cost = custom_extractor(response)
+        if custom_cost is not None:
+            return float(custom_cost)
 
     pricing = _resolve_token_pricing(
         response,
@@ -505,6 +501,10 @@ def extract_api_cost(
         cache_read_price_per_million=cache_read_price_per_million,
         model_pricing=model_pricing,
     )
+    provider_cost = _estimate_provider_cost(provider_name, response, pricing)
+    if provider_cost is not None:
+        return provider_cost
+
     openai_token_counts = _extract_openai_token_counts(response)
     if openai_token_counts is not None:
         return _estimate_openai_cost(openai_token_counts, pricing)
diff --git a/src/art/metrics.py b/src/art/metrics.py
index 4a064fda..7be67e08 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -182,8 +182,6 @@ def measure(self, key: str):
 
     async def flush(self) -> dict[str, float]:
         async with self._shared_state.lock:
-            self._validate_hierarchy()
-
             result = dict(self._shared_state.step_buffer)
             cost_metrics = {
                 key: value
@@ -321,17 +319,6 @@ def _validate_and_add(self, key: str, value: float) -> None:
             self._shared_state.step_buffer.get(key, 0.0) + value
         )
 
-    def _validate_hierarchy(self) -> None:
-        keys = sorted(
-            k for k in self._shared_state.step_buffer if k.startswith("costs/")
-        )
-        for i, key in enumerate(keys):
-            for other in keys[i + 1 :]:
-                if other.startswith(f"{key}/"):
-                    raise ValueError(
-                        f"Leaf/parent conflict: '{key}' and '{other}' cannot coexist."
-                    )
-
     def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]:
         if not cost_metrics:
             return {}

From 401547bb693ed2b6f8544c086c86e3eee95e1596 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 11:47:47 -0700
Subject: [PATCH 41/46] refactor: Simplify metric taxonomy key handling

---
 docs/metrics-taxonomy.md                      | 46 ++++++++--------
 src/art/local/backend.py                      |  2 -
 src/art/metrics_taxonomy.py                   | 52 ++-----------------
 .../binary_prefix_tool_pipeline.py            |  2 +-
 src/art/pipeline_trainer/trainer.py           |  3 ++
 src/art/serverless/backend.py                 | 41 +++++++++++++--
 src/art/tinker_native/backend.py              | 33 ++++++++++--
 src/art/unsloth/train.py                      | 40 ++++++++++++--
 tests/unit/test_frontend_logging.py           |  4 +-
 tests/unit/test_metrics_taxonomy.py           | 29 +++++++++++
 10 files changed, 167 insertions(+), 85 deletions(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index 9c7c7b54..1da0b08f 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -12,28 +12,28 @@ Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups.
 - `data/*`
 - `train/*`, `val/*`, `test/*`
 
-## Train Key Mapping
-
-Current training code emits the following canonical keys:
-
-- `reward` -> `reward/mean`
-- `reward_std_dev` -> `reward/std_dev`
-- `exception_rate` -> `reward/exception_rate`
-- `group_metric_<k>` -> `reward/group_<k>`
-- `policy_loss` / `loss` -> `loss/train`
-- `entropy` -> `loss/entropy`
-- `kl_div` -> `loss/kl_div`
-- `kl_policy_ref` -> `loss/kl_policy_ref`
-- `grad_norm` -> `loss/grad_norm`
-- `learning_rate` -> `loss/learning_rate`
-- `tokens_per_second` -> `throughput/train_tok_per_sec`
-- `num_groups_submitted` -> `train/num_groups_submitted`
-- `num_groups_trainable` -> `train/num_groups_trainable`
-- `num_trajectories` -> `train/num_trajectories`
-- `num_trainable_tokens` -> `train/num_trainable_tokens`
-- `train_tokens` -> `data/step_trainer_tokens`
-- `num_datums` -> `data/step_num_datums`
-- `num_gradient_steps` -> `data/step_num_gradient_steps`
+## Backend Output
+
+ART backends emit canonical sectioned keys directly. The canonical training keys include:
+
+- `reward/mean`
+- `reward/std_dev`
+- `reward/exception_rate`
+- `reward/group_<k>`
+- `loss/train`
+- `loss/entropy`
+- `loss/kl_div`
+- `loss/kl_policy_ref`
+- `loss/grad_norm`
+- `loss/learning_rate`
+- `throughput/train_tok_per_sec`
+- `train/num_groups_submitted`
+- `train/num_groups_trainable`
+- `train/num_trajectories`
+- `train/num_trainable_tokens`
+- `data/step_trainer_tokens`
+- `data/step_num_datums`
+- `data/step_num_gradient_steps`
 
 ## Cost Rollups
 
@@ -59,7 +59,7 @@ ART now emits the following metrics from library internals where the data is ava
 - `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer`
 - `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`
 - `data/step_num_groups_trainable` for train splits
-- `data/cum/num_unique_scenarios` when scenario IDs are present in group or trajectory metadata
+- `data/cum/num_unique_scenarios` when `scenario_id` is present in group or trajectory metadata
 - `data/step_trainer_tokens` where the backend knows the trainer token count
 - `costs/gpu` on `LocalBackend` train-step logs when ART can resolve GPU pricing
 - `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s`
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index 807a0f42..d19aba32 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -48,7 +48,6 @@
     TRAIN_GRADIENT_STEPS_KEY,
     average_metric_samples,
     build_training_summary_metrics,
-    rename_train_metrics,
     summarize_trajectory_groups,
 )
 from ..model import Model, TrainableModel
@@ -789,7 +788,6 @@ async def _train_model(
         async for result in service.train(
             disk_packed_tensors, config, dev_config, verbose
         ):
-            result = rename_train_metrics(result)
             num_gradient_steps = int(
                 result.pop(TRAIN_GRADIENT_STEPS_KEY, estimated_gradient_steps)
             )
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
index e4f9e713..6d8adcc4 100644
--- a/src/art/metrics_taxonomy.py
+++ b/src/art/metrics_taxonomy.py
@@ -6,46 +6,9 @@
 from .trajectories import TrajectoryGroup
 
 TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps"
-
-_SCENARIO_ID_CANDIDATE_KEYS = (
-    "scenario_id",
-    "scenario_scenario_id",
-    "scenario_idx",
-    "scenario_scenario_idx",
-)
-
-TRAIN_METRIC_KEY_RENAMES = {
-    "reward": "reward/mean",
-    "reward_std_dev": "reward/std_dev",
-    "exception_rate": "reward/exception_rate",
-    "policy_loss": "loss/train",
-    "loss": "loss/train",
-    "entropy": "loss/entropy",
-    "kl_div": "loss/kl_div",
-    "kl_policy_ref": "loss/kl_policy_ref",
-    "grad_norm": "loss/grad_norm",
-    "learning_rate": "loss/learning_rate",
-    "tokens_per_second": "throughput/train_tok_per_sec",
-    "num_groups_submitted": "train/num_groups_submitted",
-    "num_groups_trainable": "train/num_groups_trainable",
-    "num_trajectories": "train/num_trajectories",
-    "num_trainable_tokens": "train/num_trainable_tokens",
-    "train_tokens": "data/step_trainer_tokens",
-    "num_datums": "data/step_num_datums",
-}
 _INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY})
 
 
-def rename_train_metric_key(metric: str) -> str:
-    if metric.startswith("group_metric_"):
-        return f"reward/group_{metric[len('group_metric_'):]}"
-    return TRAIN_METRIC_KEY_RENAMES.get(metric, metric)
-
-
-def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]:
-    return {rename_train_metric_key(key): float(value) for key, value in metrics.items()}
-
-
 def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]:
     totals: dict[str, float] = {}
     counts: dict[str, int] = {}
@@ -164,14 +127,7 @@ def _extract_scenario_id(group: TrajectoryGroup) -> str | None:
 def _extract_scenario_id_from_metadata(
     metadata: dict[str, Any],
 ) -> str | None:
-    for key in _SCENARIO_ID_CANDIDATE_KEYS:
-        value = metadata.get(key)
-        if value is not None:
-            return str(value)
-
-    for key, value in metadata.items():
-        if value is None:
-            continue
-        if key.endswith("scenario_id") or key.endswith("scenario_idx"):
-            return str(value)
-    return None
+    scenario_id = metadata.get("scenario_id")
+    if scenario_id is None:
+        return None
+    return str(scenario_id)
diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
index bc2f5a04..66ed32f7 100644
--- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
+++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
@@ -312,7 +312,7 @@ def build_scenario() -> Scenario:
     async def scenario_iter():
         for i in range(scenario_count):
             scenario = build_scenario()
-            scenario["metadata"] = {"scenario_idx": i}
+            scenario["metadata"] = {"scenario_id": str(i)}
             yield scenario
 
     config = PipelineConfig(
diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py
index a32ad1b8..5d569277 100644
--- a/src/art/pipeline_trainer/trainer.py
+++ b/src/art/pipeline_trainer/trainer.py
@@ -678,6 +678,9 @@ def _apply_scenario_metadata(
                 continue
             if not self._is_scalar_metadata(value):
                 continue
+            if key == "scenario_id":
+                group.metadata["scenario_id"] = value
+                continue
             group.metadata[f"scenario_{key}"] = value
 
     def _is_group_stale(self, group: TrajectoryGroup, min_version: int) -> bool:
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index d0589f7f..3c8c186e 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -14,7 +14,6 @@
     TRAIN_GRADIENT_STEPS_KEY,
     average_metric_samples,
     build_training_summary_metrics,
-    rename_train_metrics,
     summarize_trajectory_groups,
 )
 from ..trajectories import Trajectory, TrajectoryGroup
@@ -38,6 +37,42 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None:
     return None
 
 
+_UPSTREAM_TRAIN_METRIC_KEYS = {
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "tokens_per_second": "throughput/train_tok_per_sec",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def _canonicalize_upstream_metric_key(metric: str) -> str:
+    if "/" in metric:
+        return metric
+    if metric.startswith("group_metric_"):
+        return f"reward/group_{metric[len('group_metric_'):]}"
+    return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
+
+
+def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]:
+    return {
+        _canonicalize_upstream_metric_key(key): float(value)
+        for key, value in metrics.items()
+    }
+
+
 class ServerlessBackend(Backend):
     def __init__(
         self, *, api_key: str | None = None, base_url: str | None = None
@@ -328,7 +363,7 @@ async def _train_model(
                     assert pbar is not None and num_sequences is not None
                     pbar.update(1)
                     pbar.set_postfix(event.data)
-                    metrics = rename_train_metrics(
+                    metrics = _canonicalize_upstream_metrics(
                         {k: float(v) for k, v in event.data.items()}
                     )
                     yield {
@@ -500,7 +535,7 @@ async def _train_sft(
                     assert pbar is not None and num_batches is not None
                     pbar.update(1)
                     pbar.set_postfix(event.data)
-                    metrics = rename_train_metrics(
+                    metrics = _canonicalize_upstream_metrics(
                         {k: float(v) for k, v in event.data.items()}
                     )
                     yield {
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index aeb41e1c..f8c09b25 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -32,7 +32,6 @@
 from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing
 from ..metrics_taxonomy import (
     build_training_summary_metrics,
-    rename_train_metric_key,
     summarize_trajectory_groups,
 )
 from ..model import Model, TrainableModel
@@ -52,6 +51,34 @@
 STATE_KEY_LATEST_STEP = "latest_step"
 T = TypeVar("T")
 
+_UPSTREAM_TRAIN_METRIC_KEYS = {
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "tokens_per_second": "throughput/train_tok_per_sec",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def _canonicalize_upstream_metric_key(metric: str) -> str:
+    if "/" in metric:
+        return metric
+    if metric.startswith("group_metric_"):
+        return f"reward/group_{metric[len('group_metric_'):]}"
+    return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
+
 
 @dataclass
 class ModelState:
@@ -280,12 +307,12 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum:
             for key, value in forward_output.metrics.items():
                 if value is None:
                     continue
-                metrics[rename_train_metric_key(key)] = float(value)
+                metrics[_canonicalize_upstream_metric_key(key)] = float(value)
         if optim_output.metrics:
             for key, value in optim_output.metrics.items():
                 if value is None:
                     continue
-                metrics[rename_train_metric_key(key)] = float(value)
+                metrics[_canonicalize_upstream_metric_key(key)] = float(value)
 
         next_step = state.current_step + 1
         checkpoint_name = f"step_{next_step:06d}"
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index 4b8d15d7..4505215a 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -12,7 +12,6 @@
 
 from .. import dev
 from ..loss import loss_fn, shift_tensor
-from ..metrics_taxonomy import rename_train_metric_key, rename_train_metrics
 from ..types import TrainConfig
 
 if TYPE_CHECKING:
@@ -20,6 +19,41 @@
 
 nest_asyncio.apply()
 
+_UPSTREAM_TRAIN_METRIC_KEYS = {
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "tokens_per_second": "throughput/train_tok_per_sec",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def _canonicalize_upstream_metric_key(metric: str) -> str:
+    if "/" in metric:
+        return metric
+    if metric.startswith("group_metric_"):
+        return f"reward/group_{metric[len('group_metric_'):]}"
+    return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
+
+
+def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]:
+    return {
+        _canonicalize_upstream_metric_key(key): float(value)
+        for key, value in metrics.items()
+    }
+
 
 async def train(
     trainer: "GRPOTrainer",
@@ -198,12 +232,12 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None:
         if next(iter(logs.keys())).startswith("eval_"):
             normalized_metrics = {f"val/{key}": val for key, val in metrics.items()}
             normalized_logs = {
-                f"val/{rename_train_metric_key(key[len('eval_') :])}": val
+                f"val/{_canonicalize_upstream_metric_key(key[len('eval_') :])}": val
                 for key, val in logs.items()
             }
             results_queue.put_nowait({**normalized_metrics, **normalized_logs})
         else:
-            results_queue.put_nowait({**rename_train_metrics(logs), **metrics})
+            results_queue.put_nowait({**_canonicalize_upstream_metrics(logs), **metrics})
         trainer._metrics["train"].clear()
 
     return log
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index c0adcb40..c4b2bf9f 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -589,7 +589,7 @@ async def test_train_logs_add_default_data_metrics_from_trajectory_groups(
                         messages_and_choices=[{"role": "user", "content": "b"}],
                     ),
                 ],
-                metadata={"scenario_scenario_id": "scenario-1"},
+                metadata={"scenario_id": "scenario-1"},
             ),
             TrajectoryGroup(
                 trajectories=[
@@ -599,7 +599,7 @@ async def test_train_logs_add_default_data_metrics_from_trajectory_groups(
                     )
                 ],
                 exceptions=[],
-                metadata={"scenario_scenario_id": "scenario-2"},
+                metadata={"scenario_id": "scenario-2"},
             ),
         ]
 
diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py
index 7d22ddf2..b2eaadc0 100644
--- a/tests/unit/test_metrics_taxonomy.py
+++ b/tests/unit/test_metrics_taxonomy.py
@@ -1,10 +1,12 @@
 import pytest
 
+from art import Trajectory, TrajectoryGroup
 from art.metrics_taxonomy import (
     TRAIN_GRADIENT_STEPS_KEY,
     TrajectoryBatchSummary,
     average_metric_samples,
     build_training_summary_metrics,
+    summarize_trajectory_groups,
 )
 
 
@@ -49,3 +51,30 @@ def test_average_metric_samples_requires_invariant_gradient_step_count() -> None
                 {TRAIN_GRADIENT_STEPS_KEY: 3.0},
             ]
         )
+
+
+def test_summarize_trajectory_groups_only_counts_explicit_scenario_id() -> None:
+    summary = summarize_trajectory_groups(
+        [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[{"role": "user", "content": "a"}],
+                    )
+                ],
+                metadata={"scenario_id": "scenario-1"},
+            ),
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.0,
+                        messages_and_choices=[{"role": "user", "content": "b"}],
+                    )
+                ],
+                metadata={"scenario_scenario_id": "legacy-scenario"},
+            ),
+        ]
+    )
+
+    assert summary.scenario_ids == ["scenario-1"]

From 92384a3f4cd611fa038724b2694a5992e747500b Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 12:43:17 -0700
Subject: [PATCH 42/46] refactor: Canonicalize cost and throughput keys

---
 docs/metrics-taxonomy.md                      |  4 +--
 src/art/costs.py                              | 15 ++++++++---
 src/art/model.py                              | 25 +++++++------------
 .../binary_prefix_tool_pipeline.py            |  9 ++++---
 src/art/serverless/backend.py                 |  6 +++--
 src/art/tinker_native/backend.py              | 11 +++++---
 src/art/unsloth/service.py                    |  1 -
 src/art/unsloth/train.py                      |  6 +++--
 tests/unit/test_frontend_logging.py           | 10 ++++----
 tests/unit/test_track_api_cost.py             |  3 ++-
 10 files changed, 50 insertions(+), 40 deletions(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index 1da0b08f..72603c68 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -26,7 +26,6 @@ ART backends emit canonical sectioned keys directly. The canonical training keys
 - `loss/kl_policy_ref`
 - `loss/grad_norm`
 - `loss/learning_rate`
-- `throughput/train_tok_per_sec`
 - `train/num_groups_submitted`
 - `train/num_groups_trainable`
 - `train/num_trajectories`
@@ -37,10 +36,9 @@ ART backends emit canonical sectioned keys directly. The canonical training keys
 
 ## Cost Rollups
 
-Cost leaves can be logged with either:
+Cost leaves must be logged with:
 
 - hierarchical keys, e.g. `costs/train/llm_judge/correctness`
-- legacy component keys, e.g. `costs_prefill`, `costs_sample`
 
 ART rolls costs up automatically:
 
diff --git a/src/art/costs.py b/src/art/costs.py
index 5ee5523a..e3e2b2b4 100644
--- a/src/art/costs.py
+++ b/src/art/costs.py
@@ -16,7 +16,7 @@ class ModelPricing:
 
 
 TokenCount: TypeAlias = int | None
-CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount], dict[str, float]]
+CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount, str], dict[str, float]]
 
 # Pricing per model ($/1M tokens). Keep in sync with infra pricing.
 MODEL_PRICING: dict[str, ModelPricing] = {
@@ -88,16 +88,20 @@ def compute_sample_costs(
     *,
     prompt_tokens: int | None,
     completion_tokens: int | None,
+    cost_context: str,
     pricing: ModelPricing,
 ) -> dict[str, float]:
     """Compute prompt+completion costs for a single API call."""
+    normalized_context = cost_context.strip("/")
+    if not normalized_context:
+        raise ValueError("cost_context must be non-empty")
     prompt_value = float(prompt_tokens or 0)
     completion_value = float(completion_tokens or 0)
     prefill_cost = tokens_to_cost(prompt_value, pricing.prefill)
     sample_cost = tokens_to_cost(completion_value, pricing.sample)
     return {
-        "costs_prefill": prefill_cost,
-        "costs_sample": sample_cost,
+        f"costs/{normalized_context}/prefill": prefill_cost,
+        f"costs/{normalized_context}/sample": sample_cost,
     }
 
 
@@ -105,11 +109,14 @@ def build_cost_calculator(pricing: ModelPricing) -> CostCalculator:
     """Return a callable that computes prompt+completion costs for a request."""
 
     def _calculator(
-        prompt_tokens: int | None, completion_tokens: int | None
+        prompt_tokens: int | None,
+        completion_tokens: int | None,
+        cost_context: str,
     ) -> dict[str, float]:
         return compute_sample_costs(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
+            cost_context=cost_context,
             pricing=pricing,
         )
 
diff --git a/src/art/model.py b/src/art/model.py
index b4d7bdef..13f8ed1a 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -36,8 +36,6 @@
 ModelConfig = TypeVar("ModelConfig", bound=BaseModel | None)
 StateType = TypeVar("StateType", bound=dict[str, Any], default=dict[str, Any])
 
-COSTS_METRIC_PREFIX = "costs_"
-COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
 METRICS_BUILDER_STATE_KEY = "_metrics_builder_state"
 METRIC_SECTIONS = frozenset(
     {
@@ -505,24 +503,17 @@ def _route_metrics_and_collect_non_costs(
         self, metrics: dict[str, float], split: str
     ) -> dict[str, float]:
         non_cost_metrics: dict[str, float] = {}
-        cost_context = "train" if split == "train" else "eval"
         for metric, value in metrics.items():
             numeric_value = float(value)
-            if metric == COSTS_TOTAL_KEY:
-                raise ValueError(
-                    "Do not log 'costs_total' directly. Log costs_* components "
-                    "(e.g., costs_prefill, costs_sample) and totals are derived."
-                )
             if metric.startswith("costs/"):
                 self._metrics_builder.add_cost(metric[len("costs/") :], numeric_value)
                 continue
-            if metric.startswith(COSTS_METRIC_PREFIX):
-                component = metric[len(COSTS_METRIC_PREFIX) :]
-                if component:
-                    self._metrics_builder.add_cost(
-                        f"{cost_context}/{component}", numeric_value
-                    )
-                continue
+            if metric.startswith("costs_"):
+                raise ValueError(
+                    "Legacy cost keys like 'costs_prefill' are no longer supported. "
+                    "Log hierarchical costs like 'costs/train/prefill' or "
+                    "'costs/eval/prefill' instead."
+                )
             if is_builder_managed_metric(metric):
                 self._metrics_builder.add_metric(metric, numeric_value)
                 continue
@@ -878,7 +869,9 @@ def set_cost_calculator(self, calculator: CostCalculator | None) -> None:
 
     @staticmethod
     def _noop_cost_calculator(
-        _prompt_tokens: int | None, _completion_tokens: int | None
+        _prompt_tokens: int | None,
+        _completion_tokens: int | None,
+        _cost_context: str,
     ) -> dict[str, float]:
         return {}
 
diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
index 66ed32f7..f9593c24 100644
--- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
+++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
@@ -229,7 +229,9 @@ async def main() -> None:
     openai_client = model.openai_client()
     cost_calculator = model.cost_calculator
 
-    async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory:
+    async def do_rollout(
+        scenario: Scenario, temp: float, cost_context: str
+    ) -> art.Trajectory:
         """Core rollout logic used by both training and eval."""
         messages: art.Messages = scenario["messages"]
         response = await openai_client.chat.completions.create(
@@ -265,6 +267,7 @@ async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory:
         sample_costs = cost_calculator(
             prompt_tokens,
             completion_tokens,
+            cost_context,
         )
         if sample_costs:
             metrics.update(sample_costs)
@@ -281,7 +284,7 @@ async def single_rollout(
         scenario: Scenario,
         _config: PipelineConfig,
     ) -> art.Trajectory:
-        return await do_rollout(scenario, temperature)
+        return await do_rollout(scenario, temperature, "train")
 
     rollout_fn = make_group_rollout_fn(single_rollout, n=rollouts_per_scenario)
 
@@ -290,7 +293,7 @@ async def single_rollout(
     async def eval_fn(
         _model: art.TrainableModel, _step: int, _config: PipelineConfig
     ) -> list[art.Trajectory]:
-        tasks = [do_rollout(build_scenario(), eval_temperature)]
+        tasks = [do_rollout(build_scenario(), eval_temperature, "eval")]
         results = await asyncio.gather(*tasks, return_exceptions=True)
         trajectories = [r for r in results if isinstance(r, art.Trajectory)]
         if trajectories:
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index 3c8c186e..a28c0127 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -48,7 +48,6 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None:
     "kl_policy_ref": "loss/kl_policy_ref",
     "grad_norm": "loss/grad_norm",
     "learning_rate": "loss/learning_rate",
-    "tokens_per_second": "throughput/train_tok_per_sec",
     "num_groups_submitted": "train/num_groups_submitted",
     "num_groups_trainable": "train/num_groups_trainable",
     "num_trajectories": "train/num_trajectories",
@@ -61,6 +60,8 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None:
 def _canonicalize_upstream_metric_key(metric: str) -> str:
     if "/" in metric:
         return metric
+    if metric == "tokens_per_second":
+        return ""
     if metric.startswith("group_metric_"):
         return f"reward/group_{metric[len('group_metric_'):]}"
     return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
@@ -68,8 +69,9 @@ def _canonicalize_upstream_metric_key(metric: str) -> str:
 
 def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]:
     return {
-        _canonicalize_upstream_metric_key(key): float(value)
+        canonical_key: float(value)
         for key, value in metrics.items()
+        if (canonical_key := _canonicalize_upstream_metric_key(key))
     }
 
 
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index f8c09b25..9d234944 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -62,7 +62,6 @@
     "kl_policy_ref": "loss/kl_policy_ref",
     "grad_norm": "loss/grad_norm",
     "learning_rate": "loss/learning_rate",
-    "tokens_per_second": "throughput/train_tok_per_sec",
     "num_groups_submitted": "train/num_groups_submitted",
     "num_groups_trainable": "train/num_groups_trainable",
     "num_trajectories": "train/num_trajectories",
@@ -75,6 +74,8 @@
 def _canonicalize_upstream_metric_key(metric: str) -> str:
     if "/" in metric:
         return metric
+    if metric == "tokens_per_second":
+        return ""
     if metric.startswith("group_metric_"):
         return f"reward/group_{metric[len('group_metric_'):]}"
     return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
@@ -307,12 +308,16 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum:
             for key, value in forward_output.metrics.items():
                 if value is None:
                     continue
-                metrics[_canonicalize_upstream_metric_key(key)] = float(value)
+                canonical_key = _canonicalize_upstream_metric_key(key)
+                if canonical_key:
+                    metrics[canonical_key] = float(value)
         if optim_output.metrics:
             for key, value in optim_output.metrics.items():
                 if value is None:
                     continue
-                metrics[_canonicalize_upstream_metric_key(key)] = float(value)
+                canonical_key = _canonicalize_upstream_metric_key(key)
+                if canonical_key:
+                    metrics[canonical_key] = float(value)
 
         next_step = state.current_step + 1
         checkpoint_name = f"step_{next_step:06d}"
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
index f3a69179..cb3e3115 100644
--- a/src/art/unsloth/service.py
+++ b/src/art/unsloth/service.py
@@ -849,7 +849,6 @@ async def train_sft(
                 "loss/grad_norm": grad_norm,
                 "train/num_trajectories": float(batch.num_trajectories),
                 "train/num_trainable_tokens": float(batch.num_trainable_tokens),
-                "throughput/train_tok_per_sec": tokens_per_second,
             }
 
         # === Cleanup ===
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index 4505215a..8798910b 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -30,7 +30,6 @@
     "kl_policy_ref": "loss/kl_policy_ref",
     "grad_norm": "loss/grad_norm",
     "learning_rate": "loss/learning_rate",
-    "tokens_per_second": "throughput/train_tok_per_sec",
     "num_groups_submitted": "train/num_groups_submitted",
     "num_groups_trainable": "train/num_groups_trainable",
     "num_trajectories": "train/num_trajectories",
@@ -43,6 +42,8 @@
 def _canonicalize_upstream_metric_key(metric: str) -> str:
     if "/" in metric:
         return metric
+    if metric == "tokens_per_second":
+        return ""
     if metric.startswith("group_metric_"):
         return f"reward/group_{metric[len('group_metric_'):]}"
     return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
@@ -50,8 +51,9 @@ def _canonicalize_upstream_metric_key(metric: str) -> str:
 
 def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]:
     return {
-        _canonicalize_upstream_metric_key(key): float(value)
+        canonical_key: float(value)
         for key, value in metrics.items()
+        if (canonical_key := _canonicalize_upstream_metric_key(key))
     }
 
 
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index c4b2bf9f..1f65880d 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -636,8 +636,8 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
             split="train",
             step=1,
             metrics={
-                "costs_prefill": 0.2,
-                "costs_sample": 0.3,
+                "costs/train/prefill": 0.2,
+                "costs/train/sample": 0.3,
             },
         )
         await model.log(
@@ -645,7 +645,7 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
             split="train",
             step=2,
             metrics={
-                "costs_prefill": 0.1,
+                "costs/train/prefill": 0.1,
             },
         )
 
@@ -679,7 +679,7 @@ async def test_cost_cumulative_persists_across_model_recreation(
             trajectories=None,
             split="train",
             step=1,
-            metrics={"costs_prefill": 0.25},
+            metrics={"costs/train/prefill": 0.25},
         )
 
         model_2 = Model(
@@ -692,7 +692,7 @@ async def test_cost_cumulative_persists_across_model_recreation(
             trajectories=None,
             split="train",
             step=2,
-            metrics={"costs_prefill": 0.75},
+            metrics={"costs/train/prefill": 0.75},
         )
 
         history_path = tmp_path / "test/models/test/history.jsonl"
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index c5f951cc..675b1028 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -215,10 +215,11 @@ async def _judge() -> _OpenAIResponse:
         expected = compute_sample_costs(
             prompt_tokens=1_000,
             completion_tokens=2_000,
+            cost_context="train",
             pricing=pricing,
         )
         assert metrics["costs/train/llm_judge/global_pricing"] == pytest.approx(
-            expected["costs_prefill"] + expected["costs_sample"]
+            expected["costs/train/prefill"] + expected["costs/train/sample"]
         )
 
     @pytest.mark.asyncio

From 3b943dc5862bf1cca0fef2602fdee9237e586360 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 13:54:09 -0700
Subject: [PATCH 43/46] refactor: Require explicit API cost provider and model

---
 docs/metrics-taxonomy.md           |   6 +-
 src/art/api_costs.py               | 208 +++++++----------------------
 src/art/metrics.py                 |  10 +-
 tests/unit/test_metrics_builder.py |   2 +
 tests/unit/test_track_api_cost.py  |  46 +++++--
 5 files changed, 97 insertions(+), 175 deletions(-)

diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
index 72603c68..3bd28a11 100644
--- a/docs/metrics-taxonomy.md
+++ b/docs/metrics-taxonomy.md
@@ -130,8 +130,10 @@ Built-in usage extraction:
 - Anthropic usage (`input_tokens`, `output_tokens`)
 
 Pricing is model-aware by default. ART will use the configured model pricing from
-`art.costs.MODEL_PRICING` when it can resolve a concrete model name, and it
-raises instead of guessing when pricing is missing.
+`art.costs.MODEL_PRICING` and `art.api_costs.MODEL_TOKEN_PRICING` for an exact
+`model_name` match, and it raises instead of guessing when pricing is missing.
+`provider` and `model_name` are required on `@track_api_cost`; ART no longer
+infers them from the response payload.
 
 You can still override pricing per decorator call or register model-specific
 pricing on the builder:
diff --git a/src/art/api_costs.py b/src/art/api_costs.py
index 957ab38b..37b82b7b 100644
--- a/src/art/api_costs.py
+++ b/src/art/api_costs.py
@@ -4,7 +4,6 @@
 from dataclasses import dataclass
 from functools import wraps
 from inspect import iscoroutinefunction
-import re
 from typing import Any, ParamSpec, TypeVar
 
 from .costs import get_model_pricing, tokens_to_cost
@@ -16,7 +15,6 @@
 R = TypeVar("R")
 
 CostExtractor = Callable[[Any], float | None]
-ModelNameGetter = Callable[[Any], str | None]
 ResponseGetter = Callable[[Any], Any]
 
 
@@ -44,7 +42,7 @@ class _AnthropicTokenUsage:
     cache_read_input_tokens: float
 
 
-_DEFAULT_TOKEN_PRICING: dict[str, TokenPricing] = {
+MODEL_TOKEN_PRICING: dict[str, TokenPricing] = {
     "openai/gpt-4.1": TokenPricing(
         prompt_per_million=2.0,
         completion_per_million=8.0,
@@ -59,8 +57,8 @@ class _AnthropicTokenUsage:
 }
 
 
-def _default_token_pricing(model_name: str) -> TokenPricing | None:
-    explicit = _DEFAULT_TOKEN_PRICING.get(model_name)
+def _configured_token_pricing(model_name: str) -> TokenPricing | None:
+    explicit = MODEL_TOKEN_PRICING.get(model_name)
     if explicit is not None:
         return explicit
 
@@ -72,6 +70,7 @@ def _default_token_pricing(model_name: str) -> TokenPricing | None:
         completion_per_million=pricing.sample,
     )
 
+
 def normalize_provider(provider: str | None) -> str | None:
     if provider is None:
         return None
@@ -113,17 +112,6 @@ def _response_usage(response: Any) -> Any:
     return getattr(response, "usage", None)
 
 
-def _response_model_name(response: Any) -> str | None:
-    if isinstance(response, dict):
-        value = response.get("model")
-    else:
-        value = getattr(response, "model", None)
-    if value is None:
-        return None
-    normalized = str(value).strip()
-    return normalized or None
-
-
 def _extract_openai_token_counts(response: Any) -> _OpenAITokenUsage | None:
     usage = _response_usage(response)
     prompt_tokens = _read_usage_field(usage, "prompt_tokens")
@@ -170,24 +158,6 @@ def _extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | Non
     )
 
 
-def _detect_provider(response: Any) -> str | None:
-    usage = _response_usage(response)
-    if usage is None:
-        return None
-
-    if (
-        _read_usage_field(usage, "prompt_tokens") is not None
-        or _read_usage_field(usage, "completion_tokens") is not None
-    ):
-        return OPENAI_PROVIDER
-    if (
-        _read_usage_field(usage, "input_tokens") is not None
-        or _read_usage_field(usage, "output_tokens") is not None
-    ):
-        return ANTHROPIC_PROVIDER
-    return None
-
-
 def _estimate_openai_cost(
     token_counts: _OpenAITokenUsage | None,
     pricing: TokenPricing,
@@ -250,7 +220,7 @@ def _estimate_anthropic_cost(
 
 
 def _estimate_provider_cost(
-    provider_name: str | None,
+    provider_name: str,
     response: Any,
     pricing: TokenPricing,
 ) -> float | None:
@@ -260,42 +230,10 @@ def _estimate_provider_cost(
         return _estimate_anthropic_cost(
             _extract_anthropic_token_counts(response),
             pricing,
-        )
+    )
     return None
 
 
-def _strip_snapshot_suffix(model_name: str) -> str:
-    for pattern in (
-        r"^(.*)-\d{4}-\d{2}-\d{2}$",
-        r"^(.*)-\d{8}$",
-    ):
-        match = re.match(pattern, model_name)
-        if match is not None:
-            return match.group(1)
-    return model_name
-
-
-def _candidate_model_names(
-    normalized_model_name: str,
-    *,
-    provider: str | None,
-) -> list[str]:
-    candidates: list[str] = []
-
-    def _append(candidate: str | None) -> None:
-        if candidate and candidate not in candidates:
-            candidates.append(candidate)
-
-    _append(normalized_model_name)
-    _append(_strip_snapshot_suffix(normalized_model_name))
-
-    if provider is not None and "/" not in normalized_model_name:
-        _append(f"{provider}/{normalized_model_name}")
-        _append(f"{provider}/{_strip_snapshot_suffix(normalized_model_name)}")
-
-    return candidates
-
-
 def _resolve_registered_or_default_pricing(
     model_name: str,
     *,
@@ -304,7 +242,7 @@ def _resolve_registered_or_default_pricing(
     registered = model_pricing.get(model_name)
     if registered is not None:
         return registered
-    return _default_token_pricing(model_name)
+    return _configured_token_pricing(model_name)
 
 
 def _merge_token_pricing(
@@ -345,52 +283,19 @@ def _merge_token_pricing(
     )
 
 
-def _resolve_model_name(
-    response: Any,
-    *,
-    provider: str | None,
-    model_name: str | None,
-    model_name_getter: ModelNameGetter | None,
-    model_pricing: Mapping[str, TokenPricing],
-) -> str | None:
-    explicit_model_name = model_name.strip() if model_name is not None else None
-    if explicit_model_name:
-        candidate = explicit_model_name
-    elif model_name_getter is not None:
-        candidate = model_name_getter(response)
-    else:
-        candidate = _response_model_name(response)
-
-    if candidate is None:
+def normalize_model_name(model_name: str | None) -> str | None:
+    if model_name is None:
         return None
-
-    normalized_model_name = str(candidate).strip()
-    if not normalized_model_name:
+    normalized = model_name.strip()
+    if not normalized:
         return None
-
-    normalized_provider = normalize_provider(provider)
-    candidates = _candidate_model_names(
-        normalized_model_name,
-        provider=normalized_provider,
-    )
-    for candidate in candidates:
-        if _resolve_registered_or_default_pricing(
-            candidate,
-            model_pricing=model_pricing,
-        ) is not None:
-            return candidate
-
-    if normalized_provider is not None and "/" not in normalized_model_name:
-        return f"{normalized_provider}/{normalized_model_name}"
-    return normalized_model_name
+    return normalized
 
 
 def _resolve_token_pricing(
-    response: Any,
     *,
-    provider: str | None,
-    model_name: str | None,
-    model_name_getter: ModelNameGetter | None,
+    provider: str,
+    model_name: str,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
     cached_prompt_price_per_million: float | None,
@@ -424,36 +329,22 @@ def _resolve_token_pricing(
         else None
     )
 
-    resolved_model_name = _resolve_model_name(
-        response,
-        provider=provider,
-        model_name=model_name,
-        model_name_getter=model_name_getter,
-        model_pricing=model_pricing,
-    )
-    if resolved_model_name is None:
-        if explicit_prompt_price is not None and explicit_completion_price is not None:
-            return TokenPricing(
-                prompt_per_million=explicit_prompt_price,
-                completion_per_million=explicit_completion_price,
-                cached_prompt_per_million=explicit_cached_prompt_price,
-                cache_creation_per_million=explicit_cache_creation_price,
-                cache_read_per_million=explicit_cache_read_price,
-            )
-        raise ValueError(
-            "API cost tracking requires model-aware pricing. "
-            "Provide both explicit token prices or supply a model_name "
-            "(or response.model / model_name_getter) with configured pricing."
-        )
+    if normalize_provider(provider) is None:
+        raise ValueError("provider must be non-empty")
+
+    normalized_model_name = normalize_model_name(model_name)
+    if normalized_model_name is None:
+        raise ValueError("model_name must be non-empty")
 
     configured_pricing = _resolve_registered_or_default_pricing(
-        resolved_model_name,
+        normalized_model_name,
         model_pricing=model_pricing,
     )
     if configured_pricing is None:
         raise ValueError(
-            f"No pricing configured for model '{resolved_model_name}'. "
-            "Provide explicit token prices or register model pricing."
+            f"No pricing configured for model '{normalized_model_name}'. "
+            "Add it to art.api_costs.MODEL_TOKEN_PRICING, art.costs.MODEL_PRICING, "
+            "or register it with MetricsBuilder.register_model_pricing()."
         )
 
     return _merge_token_pricing(
@@ -469,9 +360,8 @@ def _resolve_token_pricing(
 def extract_api_cost(
     response: Any,
     *,
-    provider: str | None,
-    model_name: str | None,
-    model_name_getter: ModelNameGetter | None,
+    provider: str,
+    model_name: str,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
     cached_prompt_price_per_million: float | None,
@@ -480,20 +370,19 @@ def extract_api_cost(
     cost_extractors: Mapping[str, CostExtractor],
     model_pricing: Mapping[str, TokenPricing],
 ) -> float | None:
-    provider_name = normalize_provider(provider) or _detect_provider(response)
-    custom_extractor = (
-        cost_extractors.get(provider_name) if provider_name is not None else None
-    )
+    provider_name = normalize_provider(provider)
+    if provider_name is None:
+        raise ValueError("provider must be non-empty")
+
+    custom_extractor = cost_extractors.get(provider_name)
     if custom_extractor is not None:
         custom_cost = custom_extractor(response)
         if custom_cost is not None:
             return float(custom_cost)
 
     pricing = _resolve_token_pricing(
-        response,
         provider=provider_name,
         model_name=model_name,
-        model_name_getter=model_name_getter,
         prompt_price_per_million=prompt_price_per_million,
         completion_price_per_million=completion_price_per_million,
         cached_prompt_price_per_million=cached_prompt_price_per_million,
@@ -505,21 +394,23 @@ def extract_api_cost(
     if provider_cost is not None:
         return provider_cost
 
-    openai_token_counts = _extract_openai_token_counts(response)
-    if openai_token_counts is not None:
-        return _estimate_openai_cost(openai_token_counts, pricing)
-    anthropic_token_counts = _extract_anthropic_token_counts(response)
-    return _estimate_anthropic_cost(anthropic_token_counts, pricing)
+    if provider_name in {OPENAI_PROVIDER, ANTHROPIC_PROVIDER}:
+        raise ValueError(
+            f"Response usage does not match provider '{provider_name}'. "
+            "Pass the correct provider/model pair or register a custom cost extractor."
+        )
+    raise ValueError(
+        f"No cost extractor registered for provider '{provider_name}'."
+    )
 
 
 def _record_api_cost(
     *,
     result: Any,
     source: str,
-    provider: str | None,
+    provider: str,
     response_getter: ResponseGetter | None,
-    model_name: str | None,
-    model_name_getter: ModelNameGetter | None,
+    model_name: str,
     prompt_price_per_million: float | None,
     completion_price_per_million: float | None,
     cached_prompt_price_per_million: float | None,
@@ -539,7 +430,6 @@ def _record_api_cost(
         response,
         provider=provider,
         model_name=model_name,
-        model_name_getter=model_name_getter,
         prompt_price_per_million=prompt_price_per_million,
         completion_price_per_million=completion_price_per_million,
         cached_prompt_price_per_million=cached_prompt_price_per_million,
@@ -551,9 +441,8 @@ def _record_api_cost(
 def track_api_cost(
     *,
     source: str,
-    provider: str | None = None,
-    model_name: str | None = None,
-    model_name_getter: ModelNameGetter | None = None,
+    provider: str,
+    model_name: str,
     response_getter: ResponseGetter | None = None,
     prompt_price_per_million: float | None = None,
     completion_price_per_million: float | None = None,
@@ -566,6 +455,11 @@ def track_api_cost(
         raise ValueError("source must be non-empty")
 
     normalized_provider = normalize_provider(provider)
+    if normalized_provider is None:
+        raise ValueError("provider must be non-empty")
+    normalized_model_name = normalize_model_name(model_name)
+    if normalized_model_name is None:
+        raise ValueError("model_name must be non-empty")
 
     def _decorate(func: Callable[P, R]) -> Callable[P, R]:
         if iscoroutinefunction(func):
@@ -578,8 +472,7 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
                     source=normalized_source,
                     provider=normalized_provider,
                     response_getter=response_getter,
-                    model_name=model_name,
-                    model_name_getter=model_name_getter,
+                    model_name=normalized_model_name,
                     prompt_price_per_million=prompt_price_per_million,
                     completion_price_per_million=completion_price_per_million,
                     cached_prompt_price_per_million=cached_prompt_price_per_million,
@@ -598,8 +491,7 @@ def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
                 source=normalized_source,
                 provider=normalized_provider,
                 response_getter=response_getter,
-                model_name=model_name,
-                model_name_getter=model_name_getter,
+                model_name=normalized_model_name,
                 prompt_price_per_million=prompt_price_per_million,
                 completion_price_per_million=completion_price_per_million,
                 cached_prompt_price_per_million=cached_prompt_price_per_million,
diff --git a/src/art/metrics.py b/src/art/metrics.py
index 7be67e08..11d1e4d6 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -9,9 +9,9 @@
 
 from .api_costs import (
     CostExtractor,
-    ModelNameGetter,
     TokenPricing,
     extract_api_cost,
+    normalize_model_name,
     normalize_provider,
 )
 
@@ -94,9 +94,8 @@ def add_response_cost(
         source: str,
         response: Any,
         *,
-        provider: str | None = None,
-        model_name: str | None = None,
-        model_name_getter: "ModelNameGetter | None" = None,
+        provider: str,
+        model_name: str,
         prompt_price_per_million: float | None = None,
         completion_price_per_million: float | None = None,
         cached_prompt_price_per_million: float | None = None,
@@ -111,7 +110,6 @@ def add_response_cost(
             response,
             provider=provider,
             model_name=model_name,
-            model_name_getter=model_name_getter,
             prompt_price_per_million=prompt_price_per_million,
             completion_price_per_million=completion_price_per_million,
             cached_prompt_price_per_million=cached_prompt_price_per_million,
@@ -256,7 +254,7 @@ def register_model_pricing(
         cache_creation_per_million: float | None = None,
         cache_read_per_million: float | None = None,
     ) -> None:
-        normalized_model_name = model_name.strip()
+        normalized_model_name = normalize_model_name(model_name)
         if not normalized_model_name:
             raise ValueError("model_name must be non-empty")
         self._shared_state.model_pricing[normalized_model_name] = TokenPricing(
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index 57def527..5031df14 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -197,6 +197,8 @@ async def test_add_response_cost_uses_registered_model_pricing(self) -> None:
                 "model": "anthropic/test-judge",
                 "usage": {"input_tokens": 40, "output_tokens": 60},
             },
+            provider="anthropic",
+            model_name="anthropic/test-judge",
         )
 
         metrics = await builder.flush()
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index 675b1028..c16e5fd7 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -87,6 +87,7 @@ async def test_openai_cost_extraction_with_explicit_pricing(self) -> None:
         @track_api_cost(
             source="llm_judge/correctness",
             provider="openai",
+            model_name="openai/gpt-4.1",
             prompt_price_per_million=1.0,
             completion_price_per_million=2.0,
         )
@@ -109,6 +110,7 @@ async def test_openai_cost_extraction_accounts_for_cached_tokens(self) -> None:
         @track_api_cost(
             source="llm_judge/cached_openai",
             provider="openai",
+            model_name="openai/gpt-4.1",
             prompt_price_per_million=2.0,
             completion_price_per_million=8.0,
             cached_prompt_price_per_million=0.5,
@@ -140,6 +142,7 @@ async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) ->
 
         @track_api_cost(
             source="llm_judge/faithfulness",
+            provider="anthropic",
             model_name="anthropic/test-judge",
         )
         async def _judge() -> _AnthropicResponse:
@@ -190,14 +193,18 @@ async def _judge() -> _AnthropicResponse:
         assert metrics["costs/eval/llm_judge/anthropic_cache"] == pytest.approx(0.00495)
 
     @pytest.mark.asyncio
-    async def test_response_model_name_resolves_provider_scoped_global_pricing(
+    async def test_explicit_model_name_uses_global_pricing(
         self,
     ) -> None:
         builder = MetricsBuilder(cost_context="train")
         pricing = get_model_pricing("openai/gpt-oss-20b")
         assert pricing is not None
 
-        @track_api_cost(source="llm_judge/global_pricing", provider="openai")
+        @track_api_cost(
+            source="llm_judge/global_pricing",
+            provider="openai",
+            model_name="openai/gpt-oss-20b",
+        )
         async def _judge() -> _OpenAIResponse:
             return _OpenAIResponse(
                 prompt_tokens=1_000,
@@ -223,7 +230,7 @@ async def _judge() -> _OpenAIResponse:
         )
 
     @pytest.mark.asyncio
-    async def test_response_model_name_resolves_provider_scoped_registered_pricing(
+    async def test_explicit_model_name_uses_registered_pricing(
         self,
     ) -> None:
         builder = MetricsBuilder(cost_context="eval")
@@ -233,7 +240,11 @@ async def test_response_model_name_resolves_provider_scoped_registered_pricing(
             completion_per_million=2.5,
         )
 
-        @track_api_cost(source="llm_judge/provider_resolution", provider="anthropic")
+        @track_api_cost(
+            source="llm_judge/provider_resolution",
+            provider="anthropic",
+            model_name="anthropic/test-judge",
+        )
         async def _judge() -> _AnthropicResponse:
             return _AnthropicResponse(
                 input_tokens=400,
@@ -253,10 +264,14 @@ async def _judge() -> _AnthropicResponse:
         )
 
     @pytest.mark.asyncio
-    async def test_snapshot_model_name_resolves_to_global_pricing(self) -> None:
+    async def test_explicit_model_name_does_not_depend_on_response_model(self) -> None:
         builder = MetricsBuilder(cost_context="train")
 
-        @track_api_cost(source="llm_judge/snapshot", provider="openai")
+        @track_api_cost(
+            source="llm_judge/snapshot",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
         async def _judge() -> _OpenAIResponse:
             return _OpenAIResponse(
                 prompt_tokens=1_000,
@@ -279,13 +294,17 @@ async def _judge() -> _OpenAIResponse:
     async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None:
         builder = MetricsBuilder(cost_context="train")
 
-        @track_api_cost(source="llm_judge/missing_pricing", provider="openai")
+        @track_api_cost(
+            source="llm_judge/missing_pricing",
+            provider="openai",
+            model_name="openai/missing-pricing-model",
+        )
         async def _judge() -> _OpenAIResponse:
             return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
 
         token = builder.activate()
         try:
-            with pytest.raises(ValueError, match="model-aware pricing"):
+            with pytest.raises(ValueError, match="No pricing configured"):
                 await _judge()
         finally:
             token.var.reset(token)
@@ -298,6 +317,7 @@ async def test_custom_extractor_takes_precedence(self) -> None:
         @track_api_cost(
             source="llm_judge/custom",
             provider="openai",
+            model_name="openai/gpt-4.1",
             prompt_price_per_million=1.0,
             completion_price_per_million=2.0,
         )
@@ -315,7 +335,11 @@ async def _judge() -> _OpenAIResponse:
 
     @pytest.mark.asyncio
     async def test_decorator_noops_without_active_builder(self) -> None:
-        @track_api_cost(source="llm_judge/no_context", provider="openai")
+        @track_api_cost(
+            source="llm_judge/no_context",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
         async def _judge() -> _OpenAIResponse:
             return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
 
@@ -330,6 +354,7 @@ async def test_for_cost_context_routes_to_eval_and_shares_state(self) -> None:
         @track_api_cost(
             source="llm_judge/correctness",
             provider="openai",
+            model_name="openai/gpt-4.1",
             prompt_price_per_million=1.0,
             completion_price_per_million=2.0,
         )
@@ -359,6 +384,7 @@ async def test_model_log_emits_train_and_eval_costs(self, tmp_path: Path) -> Non
         @track_api_cost(
             source="llm_judge/correctness",
             provider="openai",
+            model_name="openai/gpt-4.1",
             prompt_price_per_million=1.0,
             completion_price_per_million=2.0,
         )
@@ -368,6 +394,7 @@ async def _train_judge() -> _OpenAIResponse:
         @track_api_cost(
             source="llm_judge/factuality",
             provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
             prompt_price_per_million=3.0,
             completion_price_per_million=4.0,
         )
@@ -471,6 +498,7 @@ async def test_pipeline_trainer_activates_eval_context_for_eval_fn(
         @track_api_cost(
             source="llm_judge/correctness",
             provider="openai",
+            model_name="openai/gpt-4.1",
             prompt_price_per_million=1.0,
             completion_price_per_million=2.0,
         )

From a03299439a452a3f400bf070f2488544d5a5891c Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 15:44:48 -0700
Subject: [PATCH 44/46] docs: Replace metrics taxonomy note

---
 docs/docs.json                       |   3 +-
 docs/features/tracking-metrics.mdx   | 167 +++++++++++++++++++++++++++
 docs/getting-started/quick-start.mdx |   2 +-
 docs/metrics-taxonomy.md             | 149 ------------------------
 4 files changed, 170 insertions(+), 151 deletions(-)
 create mode 100644 docs/features/tracking-metrics.mdx
 delete mode 100644 docs/metrics-taxonomy.md

diff --git a/docs/docs.json b/docs/docs.json
index 99f5675c..2b99e176 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -67,6 +67,7 @@
           "features/checkpoint-forking",
           "features/checkpoint-deletion",
           "features/additional-histories",
+          "features/tracking-metrics",
           "features/mcp-rl"
         ]
       },
@@ -106,4 +107,4 @@
     "bluesky": "https://bsky.app/profile/openpipe.bsky.social",
     "github": "https://github.com/openpipe/ART"
   }
-}
\ No newline at end of file
+}
diff --git a/docs/features/tracking-metrics.mdx b/docs/features/tracking-metrics.mdx
new file mode 100644
index 00000000..3aea84e9
--- /dev/null
+++ b/docs/features/tracking-metrics.mdx
@@ -0,0 +1,167 @@
+---
+title: "Tracking Metrics"
+description: "See what ART logs automatically and how to add your own metrics and costs."
+sidebarTitle: "Tracking Metrics"
+icon: "chart-line"
+---
+
+ART writes a metrics row every time you call `model.log(...)`. Those rows go to
+`history.jsonl` in the run directory and, if W&B logging is enabled, to W&B.
+
+Use this page for three things:
+
+- understand the metrics ART emits automatically
+- add task-specific metrics from your own rollout code
+- track external judge and API spend alongside training metrics
+
+## What ART logs automatically
+
+When you call `await model.train(...)` or `await model.log(train_groups, split="train")`,
+ART already logs most of the metrics you need to monitor a run.
+
+| Type | Examples |
+| --- | --- |
+| Reward | `reward/mean`, `reward/std_dev`, `reward/exception_rate` |
+| Loss | `loss/train`, `loss/entropy`, `loss/kl_div`, `loss/grad_norm`, `loss/learning_rate` |
+| Data | `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`, `data/step_num_groups_trainable` |
+| Train summary | `train/num_groups_submitted`, `train/num_groups_trainable`, `train/num_trajectories` |
+| Time | `time/wall_clock_sec`, `time/step_wall_s`, `time/step_trainer_s` |
+| Cost | `costs/gpu` on `LocalBackend` when GPU pricing is known |
+
+If ART has the inputs it needs, it also derives:
+
+- cumulative metrics such as `time/cum/trainer_s`, `data/cum/num_unique_scenarios`, and `costs/cum/all`
+- cost rollups such as `costs/train`, `costs/eval`, and `costs/all`
+- throughput metrics such as `throughput/avg_trainer_tok_per_s` and `throughput/avg_actor_tok_per_s`
+
+<Note>
+  Some metrics only appear when the backend or your code provides the underlying
+  inputs. For example, `throughput/avg_actor_tok_per_s` requires both
+  `data/step_actor_tokens` and `time/step_actor_s`.
+</Note>
+
+## Add task-specific outcome metrics
+
+Attach metrics directly to each `Trajectory` when your rollout code knows whether
+an attempt succeeded, how many tools it called, or any other task-specific
+signal.
+
+```python
+async def rollout(model: art.Model, scenario: Scenario) -> art.Trajectory:
+    trajectory = art.Trajectory(
+        messages_and_choices=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": scenario.prompt},
+        ],
+        metadata={"scenario_id": scenario.id},
+    )
+
+    completion = await model.openai_client().chat.completions.create(
+        model=model.get_inference_name(),
+        messages=trajectory.messages(),
+    )
+    trajectory.messages_and_choices.append(completion.choices[0])
+
+    trajectory.reward = score_reward(trajectory)
+    trajectory.metrics["correct"] = float(is_correct(trajectory))
+    trajectory.metrics["tool_calls"] = float(count_tool_calls(trajectory))
+    return trajectory
+```
+
+On train steps, ART averages those rollout metrics and logs them under the
+`reward/` namespace, such as `reward/correct` and `reward/tool_calls`.
+
+If you want to record one value per `TrajectoryGroup` instead of one per
+trajectory, pass `metrics={...}` when you build the group. ART logs those once
+per group, using keys like `reward/group_difficulty` on train steps.
+
+## Add step-level metrics ART cannot infer
+
+Use `model.metrics_builder()` for metrics that live outside individual
+trajectories, such as actor-side timing, token counts, or idle time.
+
+```python
+builder = model.metrics_builder()
+
+with builder.measure("time/step_actor_s"):
+    result = await run_rollouts()
+
+builder.add_data(
+    step_num_scenarios=result.num_scenarios,
+    step_actor_tokens=result.actor_tokens,
+    scenario_ids=result.scenario_ids,
+)
+builder.add_idle_times(step_actor_idle_s=result.actor_idle_s)
+
+await model.log(result.train_groups, split="train", step=result.step)
+```
+
+A few useful patterns:
+
+- log `scenario_ids` to unlock `data/cum/num_unique_scenarios`
+- log both `data/step_actor_tokens` and `time/step_actor_s` to unlock actor throughput metrics
+- log `time/step_eval_s` when eval runs happen outside the backend
+- use fully qualified keys like `time/step_actor_s` or `data/step_actor_tokens` for builder-managed metrics
+
+ART flushes builder-managed metrics on the next `model.log(...)` or
+`model.train(...)` call.
+
+## Track judge and API costs
+
+Use `@track_api_cost` when a function returns a provider response object with
+token usage. Wrap the relevant part of your code in a metrics context so ART
+knows whether the spend belongs to training or evaluation; when no metrics context is active, the decorator does not record a cost.
+
+```python
+from art.metrics import track_api_cost
+
+@track_api_cost(
+    source="llm_judge/correctness",
+    provider="openai",
+    model_name="openai/gpt-4.1",
+)
+async def run_judge(client, messages):
+    return await client.chat.completions.create(
+        model="gpt-4.1",
+        messages=messages,
+    )
+
+with model.metrics_builder("train").activate_context():
+    await run_judge(judge_client, train_messages)
+
+with model.metrics_builder("eval").activate_context():
+    await run_judge(judge_client, eval_messages)
+```
+
+The next metrics row will include:
+
+- `costs/train/llm_judge/correctness` or `costs/eval/llm_judge/correctness`
+- rollups such as `costs/train`, `costs/eval`, and `costs/all`
+- cumulative totals such as `costs/cum/all`
+
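+ART can price OpenAI and Anthropic responses from their usage fields. You must
+pass both `provider` and `model_name` to `@track_api_cost`. ART looks up pricing by exact `model_name` and raises a `ValueError` instead of guessing when pricing is missing.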
+
+For custom pricing or unsupported models, register pricing on the builder:
+
+```python
+builder = model.metrics_builder()
+builder.register_model_pricing(
+    "anthropic/my-custom-judge",
+    prompt_per_million=1.2,
+    completion_per_million=4.8,
+)
+```
+
+## Track GPU cost on LocalBackend
+
+`LocalBackend` can log `costs/gpu` automatically on train steps. ART currently
+auto-detects H200 pricing at `$3/hour` per GPU. For other hardware, ART skips
+`costs/gpu` unless you pass an explicit override:
+
+```python
+backend = LocalBackend(gpu_cost_per_hour_usd=2.25)
+```
+
+This lets ART include GPU spend in the same metrics stream as rewards, losses,
+and judge/API costs.
diff --git a/docs/getting-started/quick-start.mdx b/docs/getting-started/quick-start.mdx
index 58eb0ccf..63a38e02 100644
--- a/docs/getting-started/quick-start.mdx
+++ b/docs/getting-started/quick-start.mdx
@@ -38,4 +38,4 @@ At the top of the [notebook](https://colab.research.google.com/github/openpipe/a
 
 ## Step 3: Track metrics
 
-While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training!
+While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. For a guide to the metrics ART logs automatically and how to add your own, see [Tracking Metrics](/features/tracking-metrics). If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training!
diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md
deleted file mode 100644
index 3bd28a11..00000000
--- a/docs/metrics-taxonomy.md
+++ /dev/null
@@ -1,149 +0,0 @@
-# Metrics Taxonomy (Phase 1-3)
-
-Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups.
-
-## Sections
-
-- `reward/*`
-- `loss/*`
-- `throughput/*`
-- `costs/*`
-- `time/*`
-- `data/*`
-- `train/*`, `val/*`, `test/*`
-
-## Backend Output
-
-ART backends emit canonical sectioned keys directly. The canonical training keys include:
-
-- `reward/mean`
-- `reward/std_dev`
-- `reward/exception_rate`
-- `reward/group_<k>`
-- `loss/train`
-- `loss/entropy`
-- `loss/kl_div`
-- `loss/kl_policy_ref`
-- `loss/grad_norm`
-- `loss/learning_rate`
-- `train/num_groups_submitted`
-- `train/num_groups_trainable`
-- `train/num_trajectories`
-- `train/num_trainable_tokens`
-- `data/step_trainer_tokens`
-- `data/step_num_datums`
-- `data/step_num_gradient_steps`
-
-## Cost Rollups
-
-Cost leaves can be logged with hierarchical keys, for example:
-
-- hierarchical keys, e.g. `costs/train/llm_judge/correctness`
-
-ART rolls costs up automatically:
-
-- parent rollups (for example `costs/train`, `costs/all`)
-- cumulative keys under the `cum/` namespace (for example `costs/cum/all`)
-
-## Metrics Added By ART
-
-ART now emits the following metrics from library internals where the data is available:
-
-- `reward/*` aggregates from `model.log(..., split="train")`
-- `loss/*` from trainer backends
-- `time/wall_clock_sec` and `training_step` on every logged row
-- `time/step_trainer_s` for training calls
-- `time/step_wall_s` from `PipelineTrainer` and `LocalBackend` train-step logs
-- `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer`
-- `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`
-- `data/step_num_groups_trainable` for train splits
-- `data/cum/num_unique_scenarios` when `scenario_id` is present in group or trajectory metadata
-- `data/step_trainer_tokens` where the backend knows the trainer token count
-- `costs/gpu` on `LocalBackend` train-step logs when ART can resolve GPU pricing
-- `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s`
-- `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available
-
-Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer.
-
-For automatic GPU cost on `LocalBackend`, ART currently auto-detects H200s at
-$3/hour per GPU. For other GPU types, pass `gpu_cost_per_hour_usd=...` to
-`LocalBackend(...)` if you want ART to emit `costs/gpu` instead of skipping it.
-
-## User Helpers
-
-Use the builder helpers for step-level metrics that only user code can know:
-
-```python
-builder = model.metrics_builder()
-
-with builder.measure("time/step_actor_s"):
-    result = await run_rollouts()
-
-builder.add_data(
-    step_actor_tokens=result.actor_tokens,
-    scenario_ids=result.scenario_ids,
-)
-
-builder.add_idle_times(step_actor_idle_s=result.actor_idle_s)
-```
-
-If these metrics are logged before the next `model.log(...)` flush, ART will also emit the cumulative and derived throughput metrics automatically.
-
-## API Cost Decorator (Phase 2/3)
-
-Use `@track_api_cost` to automatically write judge/API spend into `costs/{train|eval}/...`.
-
-```python
-from art.metrics import track_api_cost
-
-@track_api_cost(
-    source="llm_judge/correctness",
-    provider="openai",
-    model_name="openai/gpt-oss-20b",
-)
-async def run_judge(client, messages):
-    return await client.chat.completions.create(
-        model="gpt-oss-20b",
-        messages=messages,
-    )
-```
-
-Activate metric cost context while running train/eval logic:
-
-```python
-with model.metrics_builder("train").activate_context():
-    await run_judge(client, train_messages)
-
-with model.metrics_builder("eval").activate_context():
-    await run_judge(client, eval_messages)
-```
-
-The next `model.log(...)` flush for that step will include:
-
-- `costs/train/llm_judge/correctness` (or `costs/eval/...`)
-- hierarchical rollups like `costs/train`, `costs/all`
-- cumulative keys like `costs/cum/all`
-
-Built-in usage extraction:
-
-- OpenAI usage (`prompt_tokens`, `completion_tokens`)
-- Anthropic usage (`input_tokens`, `output_tokens`)
-
-Pricing is model-aware by default. ART will use the configured model pricing from
-`art.costs.MODEL_PRICING` and `art.api_costs.MODEL_TOKEN_PRICING` for an exact
-`model_name` match, and it raises instead of guessing when pricing is missing.
-`provider` and `model_name` are required on `@track_api_cost`; ART no longer
-infers them from the response payload.
-
-You can still override pricing per decorator call or register model-specific
-pricing on the builder:
-
-```python
-builder = model.metrics_builder()
-builder.register_model_pricing(
-    "anthropic/my-custom-judge",
-    prompt_per_million=1.2,
-    completion_per_million=4.8,
-)
-builder.register_cost_extractor("openai", lambda response: 0.001)  # optional custom extractor
-```

From 5e3c812365118894ef073f3c2d68085e2f74e89f Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 16:03:50 -0700
Subject: [PATCH 45/46] style: Apply ruff format

---
 dev/yes-no-maybe-metrics.py             |  4 +---
 src/art/api_costs.py                    | 10 +++-------
 src/art/local/backend.py                |  4 +---
 src/art/metrics.py                      | 21 +++++----------------
 src/art/metrics_taxonomy.py             | 13 ++++++++++---
 src/art/model.py                        | 24 ++++++++++++++++--------
 src/art/serverless/backend.py           |  2 +-
 src/art/tinker_native/backend.py        |  2 +-
 src/art/unsloth/train.py                |  6 ++++--
 tests/integration/test_live_api_cost.py |  6 +++---
 tests/unit/test_frontend_logging.py     |  6 +++---
 tests/unit/test_metric_routing.py       | 17 +++++++++--------
 tests/unit/test_metrics_builder.py      |  4 +++-
 tests/unit/test_track_api_cost.py       |  4 +++-
 14 files changed, 63 insertions(+), 60 deletions(-)

diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py
index cbb0c5fd..8bb80518 100644
--- a/dev/yes-no-maybe-metrics.py
+++ b/dev/yes-no-maybe-metrics.py
@@ -173,9 +173,7 @@ async def main() -> None:
     base_model = os.environ.get("BASE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")
     project = os.environ.get("PROJECT", "yes-no-maybe-metrics")
     model = art.TrainableModel(
-        name=os.environ.get(
-            "MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}"
-        ),
+        name=os.environ.get("MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}"),
         project=project,
         base_model=base_model,
         report_metrics=["wandb"],
diff --git a/src/art/api_costs.py b/src/art/api_costs.py
index 37b82b7b..67fe9dbd 100644
--- a/src/art/api_costs.py
+++ b/src/art/api_costs.py
@@ -140,9 +140,7 @@ def _extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | Non
     cache_creation_input_tokens = (
         _read_usage_field(usage, "cache_creation_input_tokens") or 0.0
     )
-    cache_read_input_tokens = (
-        _read_usage_field(usage, "cache_read_input_tokens") or 0.0
-    )
+    cache_read_input_tokens = _read_usage_field(usage, "cache_read_input_tokens") or 0.0
     if (
         input_tokens is None
         and output_tokens is None
@@ -230,7 +228,7 @@ def _estimate_provider_cost(
         return _estimate_anthropic_cost(
             _extract_anthropic_token_counts(response),
             pricing,
-    )
+        )
     return None
 
 
@@ -399,9 +397,7 @@ def extract_api_cost(
             f"Response usage does not match provider '{provider_name}'. "
             "Pass the correct provider/model pair or register a custom cost extractor."
         )
-    raise ValueError(
-        f"No cost extractor registered for provider '{provider_name}'."
-    )
+    raise ValueError(f"No cost extractor registered for provider '{provider_name}'.")
 
 
 def _record_api_cost(
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index d2df1112..c8b0570a 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -96,9 +96,7 @@ def __init__(
         self._in_process = in_process
         self._path = path or get_default_art_path()
         self._gpu_cost_per_hour_usd = (
-            float(gpu_cost_per_hour_usd)
-            if gpu_cost_per_hour_usd is not None
-            else None
+            float(gpu_cost_per_hour_usd) if gpu_cost_per_hour_usd is not None else None
         )
         os.makedirs(self._path, exist_ok=True)
 
diff --git a/src/art/metrics.py b/src/art/metrics.py
index 11d1e4d6..eda9ab9c 100644
--- a/src/art/metrics.py
+++ b/src/art/metrics.py
@@ -236,9 +236,7 @@ def for_cost_context(self, cost_context: str) -> "MetricsBuilder":
             _shared_state=self._shared_state,
         )
 
-    def register_cost_extractor(
-        self, provider: str, extractor: CostExtractor
-    ) -> None:
+    def register_cost_extractor(self, provider: str, extractor: CostExtractor) -> None:
         normalized_provider = normalize_provider(provider)
         if normalized_provider is None:
             raise ValueError("provider must be non-empty")
@@ -356,16 +354,9 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
             self._shared_state.cum_state[cum_key] = next_value
             result[cum_key] = next_value
 
-        if (
-            "data/step_trainer_tokens" in result
-            or "time/step_trainer_s" in result
-        ):
-            trainer_tokens = self._shared_state.cum_state.get(
-                "data/cum/trainer_tokens"
-            )
-            trainer_seconds = self._shared_state.cum_state.get(
-                "time/cum/trainer_s"
-            )
+        if "data/step_trainer_tokens" in result or "time/step_trainer_s" in result:
+            trainer_tokens = self._shared_state.cum_state.get("data/cum/trainer_tokens")
+            trainer_seconds = self._shared_state.cum_state.get("time/cum/trainer_s")
             if (
                 trainer_tokens is not None
                 and trainer_seconds is not None
@@ -376,9 +367,7 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None:
                 )
 
         if "data/step_actor_tokens" in result or "time/step_actor_s" in result:
-            actor_tokens = self._shared_state.cum_state.get(
-                "data/cum/actor_tokens"
-            )
+            actor_tokens = self._shared_state.cum_state.get("data/cum/actor_tokens")
             actor_seconds = self._shared_state.cum_state.get("time/cum/actor_s")
             if (
                 actor_tokens is not None
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
index 6d8adcc4..6965b68d 100644
--- a/src/art/metrics_taxonomy.py
+++ b/src/art/metrics_taxonomy.py
@@ -9,7 +9,9 @@
 _INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY})
 
 
-def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]:
+def average_metric_samples(
+    metric_samples: Iterable[dict[str, float]],
+) -> dict[str, float]:
     totals: dict[str, float] = {}
     counts: dict[str, int] = {}
     invariant_values: dict[str, float] = {}
@@ -65,7 +67,9 @@ def summarize_trajectory_groups(
 
     return TrajectoryBatchSummary(
         num_scenarios=len(groups),
-        num_trajectories=sum(len(group.trajectories) + len(group.exceptions) for group in groups),
+        num_trajectories=sum(
+            len(group.trajectories) + len(group.exceptions) for group in groups
+        ),
         num_groups_submitted=len(groups),
         num_groups_trainable=sum(1 for group in groups if _group_is_trainable(group)),
         scenario_ids=scenario_ids,
@@ -117,7 +121,10 @@ def _group_is_trainable(group: TrajectoryGroup) -> bool:
 
 
 def _extract_scenario_id(group: TrajectoryGroup) -> str | None:
-    for metadata in [group.metadata, *(trajectory.metadata for trajectory in group.trajectories)]:
+    for metadata in [
+        group.metadata,
+        *(trajectory.metadata for trajectory in group.trajectories),
+    ]:
         scenario_id = _extract_scenario_id_from_metadata(metadata)
         if scenario_id is not None:
             return scenario_id
diff --git a/src/art/model.py b/src/art/model.py
index 13f8ed1a..a5ea06c0 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -153,9 +153,13 @@ def __init__(
         object.__setattr__(self, "_wandb_defined_metrics", set())
         object.__setattr__(self, "_run_start_time", time.time())
         object.__setattr__(self, "_run_start_monotonic", time.monotonic())
-        object.__setattr__(self, "_last_local_train_log_monotonic", self._run_start_monotonic)
+        object.__setattr__(
+            self, "_last_local_train_log_monotonic", self._run_start_monotonic
+        )
         object.__setattr__(self, "_last_local_train_step", None)
-        object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train"))
+        object.__setattr__(
+            self, "_metrics_builder", MetricsBuilder(cost_context="train")
+        )
         object.__setattr__(self, "_metrics_builder_state_loaded", False)
 
     @overload
@@ -548,7 +552,9 @@ def _collect_automatic_backend_metrics(
         if "time/step_wall_s" not in provided_metric_keys:
             automatic_metrics["time/step_wall_s"] = step_wall_s
 
-        gpu_cost_getter = getattr(self._backend, "automatic_gpu_cost_per_hour_usd", None)
+        gpu_cost_getter = getattr(
+            self._backend, "automatic_gpu_cost_per_hour_usd", None
+        )
         if callable(gpu_cost_getter) and "costs/gpu" not in provided_metric_keys:
             gpu_cost_per_hour_usd = gpu_cost_getter(self)
             if gpu_cost_per_hour_usd is not None:
@@ -597,9 +603,7 @@ def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder:
             return self._metrics_builder
         return self._metrics_builder.for_cost_context(cost_context)
 
-    def activate_metrics_context(
-        self, cost_context: str
-    ) -> Token[MetricsBuilder]:
+    def activate_metrics_context(self, cost_context: str) -> Token[MetricsBuilder]:
         return self.metrics_builder(cost_context).activate()
 
     def _load_metrics_builder_state(self) -> None:
@@ -777,7 +781,9 @@ async def log(
         for metric, values in group_metrics.items():
             if len(values) > 0:
                 group_key = (
-                    f"reward/group_{metric}" if split == "train" else f"group_metric_{metric}"
+                    f"reward/group_{metric}"
+                    if split == "train"
+                    else f"group_metric_{metric}"
                 )
                 averages[group_key] = sum(values) / len(values)
 
@@ -1080,4 +1086,6 @@ async def train_sft(
             avg_metrics["time/step_trainer_s"] = trainer_elapsed
             # Get the current step after training
             step = await self.get_step()
-            await self.log(trajectories=None, split="train", metrics=avg_metrics, step=step)
+            await self.log(
+                trajectories=None, split="train", metrics=avg_metrics, step=step
+            )
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index 4bb4d5f3..fcb9f68f 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -63,7 +63,7 @@ def _canonicalize_upstream_metric_key(metric: str) -> str:
     if metric == "tokens_per_second":
         return ""
     if metric.startswith("group_metric_"):
-        return f"reward/group_{metric[len('group_metric_'):]}"
+        return f"reward/group_{metric[len('group_metric_') :]}"
     return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
 
 
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index 9d234944..500a850f 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -77,7 +77,7 @@ def _canonicalize_upstream_metric_key(metric: str) -> str:
     if metric == "tokens_per_second":
         return ""
     if metric.startswith("group_metric_"):
-        return f"reward/group_{metric[len('group_metric_'):]}"
+        return f"reward/group_{metric[len('group_metric_') :]}"
     return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
 
 
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index 2a58879b..d93569d4 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -45,7 +45,7 @@ def _canonicalize_upstream_metric_key(metric: str) -> str:
     if metric == "tokens_per_second":
         return ""
     if metric.startswith("group_metric_"):
-        return f"reward/group_{metric[len('group_metric_'):]}"
+        return f"reward/group_{metric[len('group_metric_') :]}"
     return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
 
 
@@ -237,7 +237,9 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None:
             }
             results_queue.put_nowait({**normalized_metrics, **normalized_logs})
         else:
-            results_queue.put_nowait({**_canonicalize_upstream_metrics(logs), **metrics})
+            results_queue.put_nowait(
+                {**_canonicalize_upstream_metrics(logs), **metrics}
+            )
         trainer._metrics["train"].clear()
 
     return log
diff --git a/tests/integration/test_live_api_cost.py b/tests/integration/test_live_api_cost.py
index c2bd733c..ad7438be 100644
--- a/tests/integration/test_live_api_cost.py
+++ b/tests/integration/test_live_api_cost.py
@@ -216,9 +216,9 @@ def _judge() -> dict:
         )
         first_row, second_row = _history_rows(history_path)
 
-        assert first_row["costs/eval/llm_judge/anthropic_prompt_cache"] == pytest.approx(
-            first_expected_cost
-        )
+        assert first_row[
+            "costs/eval/llm_judge/anthropic_prompt_cache"
+        ] == pytest.approx(first_expected_cost)
         assert second_row[
             "costs/eval/llm_judge/anthropic_prompt_cache"
         ] == pytest.approx(second_expected_cost)
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 1f65880d..2afb8af6 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -349,9 +349,9 @@ async def test_metric_prefixes(self, tmp_path: Path):
                 "time/wall_clock_sec",
             ]
         ]
-        assert all(
-            k.startswith(("val/", "data/")) for k in metric_keys
-        ), f"Not all metrics routed into taxonomy namespaces: {metric_keys}"
+        assert all(k.startswith(("val/", "data/")) for k in metric_keys), (
+            f"Not all metrics routed into taxonomy namespaces: {metric_keys}"
+        )
         assert entry["training_step"] == 0
         assert entry["time/wall_clock_sec"] >= 0
 
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
index f9904527..8f6cad92 100644
--- a/tests/unit/test_metric_routing.py
+++ b/tests/unit/test_metric_routing.py
@@ -58,8 +58,7 @@ def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None
 
         assert run is fake_run
         define_calls = [
-            (call.args, call.kwargs)
-            for call in fake_wandb.define_metric.call_args_list
+            (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list
         ]
         assert define_calls == [
             (("training_step",), {}),
@@ -104,14 +103,16 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step(
                 )
 
         define_calls = [
-            (call.args, call.kwargs)
-            for call in fake_wandb.define_metric.call_args_list
+            (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list
         ]
-        assert (("costs/train/sample",), {"step_metric": "training_step"}) in define_calls
         assert (
-            (("costs/cum/train/prefill",), {"step_metric": "training_step"})
-            in define_calls
-        )
+            ("costs/train/sample",),
+            {"step_metric": "training_step"},
+        ) in define_calls
+        assert (
+            ("costs/cum/train/prefill",),
+            {"step_metric": "training_step"},
+        ) in define_calls
         fake_run.log.assert_called_once()
         logged_metrics = fake_run.log.call_args.args[0]
         assert logged_metrics["costs/train/sample"] == 0.1
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
index 5031df14..dfa24a11 100644
--- a/tests/unit/test_metrics_builder.py
+++ b/tests/unit/test_metrics_builder.py
@@ -79,7 +79,9 @@ async def test_helper_metrics_accumulate_within_a_single_step(self) -> None:
         assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0)
 
     @pytest.mark.asyncio
-    async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -> None:
+    async def test_throughput_metrics_derive_from_time_and_token_cumulatives(
+        self,
+    ) -> None:
         builder = MetricsBuilder(cost_context="train")
 
         builder.add_metric("time/step_trainer_s", 4.0)
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
index c16e5fd7..80553e48 100644
--- a/tests/unit/test_track_api_cost.py
+++ b/tests/unit/test_track_api_cost.py
@@ -132,7 +132,9 @@ async def _judge() -> _OpenAIResponse:
         assert metrics["costs/train/llm_judge/cached_openai"] == pytest.approx(0.00255)
 
     @pytest.mark.asyncio
-    async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None:
+    async def test_anthropic_cost_extraction_uses_registered_model_pricing(
+        self,
+    ) -> None:
         builder = MetricsBuilder(cost_context="train")
         builder.register_model_pricing(
             "anthropic/test-judge",

From b934c25270474ebac9052021cae6faa67aff6152 Mon Sep 17 00:00:00 2001
From: Vivek Kalyan <hello@vivekkalyan.com>
Date: Tue, 10 Mar 2026 16:15:17 -0700
Subject: [PATCH 46/46] fix: Resolve ty failures in API cost and Unsloth

---
 src/art/api_costs.py       | 13 +++++++------
 src/art/unsloth/service.py | 19 +++++++++++++++----
 src/art/unsloth/train.py   |  4 ++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/art/api_costs.py b/src/art/api_costs.py
index 67fe9dbd..1bbd9ed9 100644
--- a/src/art/api_costs.py
+++ b/src/art/api_costs.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
-from collections.abc import Callable, Mapping
+from collections.abc import Awaitable, Callable, Mapping
 from dataclasses import dataclass
 from functools import wraps
 from inspect import iscoroutinefunction
-from typing import Any, ParamSpec, TypeVar
+from typing import Any, ParamSpec, TypeVar, cast
 
 from .costs import get_model_pricing, tokens_to_cost
 
@@ -459,10 +459,11 @@ def track_api_cost(
 
     def _decorate(func: Callable[P, R]) -> Callable[P, R]:
         if iscoroutinefunction(func):
+            async_func = cast(Callable[P, Awaitable[Any]], func)
 
             @wraps(func)
-            async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
-                result = await func(*args, **kwargs)
+            async def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
+                result = await async_func(*args, **kwargs)
                 _record_api_cost(
                     result=result,
                     source=normalized_source,
@@ -477,10 +478,10 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs):
                 )
                 return result
 
-            return _async_wrapper
+            return cast(Callable[P, R], _async_wrapper)
 
         @wraps(func)
-        def _sync_wrapper(*args: P.args, **kwargs: P.kwargs):
+        def _sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
             result = func(*args, **kwargs)
             _record_api_cost(
                 result=result,
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
index 9d16bf7d..76ab1991 100644
--- a/src/art/unsloth/service.py
+++ b/src/art/unsloth/service.py
@@ -13,6 +13,7 @@
 from datasets import Dataset
 import peft
 import torch
+from torch.optim import Optimizer
 from transformers import GenerationMixin, PreTrainedModel
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from trl import GRPOConfig, GRPOTrainer
@@ -190,6 +191,13 @@ def save_checkpoint(
     return checkpoint_dir
 
 
+def _get_trainer_optimizer(trainer: GRPOTrainer) -> Optimizer:
+    optimizer = cast(Optimizer | None, getattr(trainer, "optimizer", None))
+    if optimizer is None:
+        raise RuntimeError("Trainer optimizer must be initialized before training")
+    return optimizer
+
+
 # ============================================================================
 # Model Classes
 # ============================================================================
@@ -541,10 +549,11 @@ def _reset_optimizer_if_mode_changed(
         mode_changed = (
             self._last_training_mode is not None and self._last_training_mode != mode
         )
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         if mode_changed:
             # Clear all optimizer state (exp_avg, exp_avg_sq, step for each param)
-            self._state.trainer.optimizer.state.clear()
+            optimizer.state.clear()
 
         self._last_training_mode = mode
 
@@ -576,9 +585,10 @@ async def _train_dedicated(
     ) -> AsyncIterator[dict[str, float]]:
         """Train in dedicated mode — no sleep/wake, vLLM keeps running on separate GPU."""
         self._reset_optimizer_if_mode_changed("rl")
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         rl_weight_decay = 0.1
-        for param_group in self._state.trainer.optimizer.param_groups:
+        for param_group in optimizer.param_groups:
             param_group["weight_decay"] = rl_weight_decay
 
         packed_tensors = packed_tensors_from_dir(**disk_packed_tensors)
@@ -661,10 +671,11 @@ async def _train_shared(
 
         # Reset optimizer state if switching from SFT to RL
         self._reset_optimizer_if_mode_changed("rl")
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         # Set RL-specific hyperparameters
         rl_weight_decay = 0.1
-        for param_group in self._state.trainer.optimizer.param_groups:
+        for param_group in optimizer.param_groups:
             param_group["weight_decay"] = rl_weight_decay
 
         # Load packed tensors
@@ -794,7 +805,7 @@ async def train_sft(
         # Get model and optimizer
         peft_model = self._state.peft_model
         self._reset_optimizer_if_mode_changed("sft")
-        optimizer = self._state.trainer.optimizer
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         # Set SFT-specific hyperparameters
         sft_weight_decay = 0.01
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index d93569d4..399c1c72 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -3,7 +3,7 @@
 from contextlib import contextmanager, nullcontext
 import gc
 import os
-from typing import TYPE_CHECKING, Callable, cast
+from typing import TYPE_CHECKING, Any, Callable, cast
 
 import nest_asyncio
 from peft.peft_model import PeftModel
@@ -220,7 +220,7 @@ def compute_loss(
 
 
 def get_log_fn(
-    trainer: "GRPOTrainer", results_queue: asyncio.Queue[dict[str, float]]
+    trainer: Any, results_queue: asyncio.Queue[dict[str, float]]
 ) -> Callable[..., None]:
     def log(logs: dict[str, float], start_time: float | None = None) -> None:
         metrics = {