diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py
new file mode 100644
index 000000000..8bb80518d
--- /dev/null
+++ b/dev/yes-no-maybe-metrics.py
@@ -0,0 +1,259 @@
+"""Yes-no-maybe metrics demo for the LocalBackend `backend.train()` path.
+
+This keeps the same prompt family, rollout structure, and reward ordering as
+`dev/yes-no-maybe.py`. It adds explicit metrics-taxonomy instrumentation for
+actor/eval timing and data metrics, and relies on LocalBackend for automatic
+step wall time and GPU cost logging.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from itertools import permutations
+import os
+import time
+
+from dotenv import load_dotenv
+import openai
+
+try:
+    import unsloth  # noqa: F401
+except ImportError:
+    pass
+
+import art
+from art.local import LocalBackend
+
+
+async def create_chat_completion(
+    client: openai.AsyncOpenAI,
+    *,
+    model_name: str,
+    messages: art.Messages,
+    max_tokens: int,
+    timeout: float,
+) -> openai.types.chat.chat_completion.ChatCompletion:
+    """Request one chat completion from `client` and return the raw response."""
+    completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=max_tokens,
+        timeout=timeout,
+    )
+    return completion
+
+
+def with_quotes(word: str) -> str:
+    """Return `word` wrapped in single quotes."""
+    return "'" + word + "'"
+
+
+def build_prompts() -> list[str]:
+    """Build every prompt variant used for training and evaluation.
+
+    Prompts are emitted in a fixed order: for each prefix ("respond", then
+    "just respond"), for each quoting mode (quoted, then unquoted), all
+    3-word permutations of yes/no/maybe followed by all 2-word permutations.
+
+    Three-word prompts are comma-separated and honor the quoting mode.
+    Two-word prompts are joined with " or " and are never quoted, so each of
+    them appears once per quoting mode (twice per prefix).
+    """
+    answers = ["yes", "no", "maybe"]
+    prompts: list[str] = []
+    for prefix in ["respond", "just respond"]:
+        for use_quotes in [True, False]:
+            for length in [3, 2]:
+                for words in permutations(answers, length):
+                    if length == 3:
+                        shown = [with_quotes(w) if use_quotes else w for w in words]
+                        options = ", ".join(shown)
+                    else:
+                        # Two-word prompts intentionally ignore use_quotes.
+                        options = f"{words[0]} or {words[1]}"
+                    prompts.append(f"{prefix} with {options}")
+    return prompts
+
+
+def reward_for_answer(content: str | None) -> float:
+    """Score an answer: "maybe" (1.0) > "no" (0.75) > "yes" (0.5).
+
+    Matching is exact; `None` or any other text earns 0.0.
+    """
+    rewards = {"yes": 0.5, "no": 0.75, "maybe": 1.0}
+    if content is None:
+        return 0.0
+    return rewards.get(content, 0.0)
+
+
+def scenario_id_for_prompt(prompt: str) -> str:
+    """Derive a scenario id from `prompt`: spaces become "_", "'" is dropped."""
+    return prompt.translate({ord(" "): "_", ord("'"): None})
+
+
+def response_total_tokens(
+    response: openai.types.chat.chat_completion.ChatCompletion,
+) -> int:
+    """Return prompt + completion tokens reported in `response.usage`.
+
+    A missing usage object, or a missing/zero count, contributes 0.
+    """
+    if response.usage is None:
+        return 0
+    counts = (response.usage.prompt_tokens, response.usage.completion_tokens)
+    return sum(int(count or 0) for count in counts)
+
+
+def total_actor_tokens(groups: list[art.TrajectoryGroup]) -> int:
+    """Sum the "actor_total_tokens" metadata over every trajectory in `groups`.
+
+    Trajectories without the key (or with a falsy value) count as 0.
+    """
+    total = 0
+    for group in groups:
+        for trajectory in group.trajectories:
+            total += int(trajectory.metadata.get("actor_total_tokens", 0) or 0)
+    return total
+
+
+async def rollout(
+    client: openai.AsyncOpenAI,
+    model: art.TrainableModel,
+    prompt: str,
+    *,
+    max_tokens: int,
+    timeout: float,
+) -> art.Trajectory:
+    """Run one single-turn rollout for `prompt` and score the reply.
+
+    The trajectory carries the scenario id and the response's token total in
+    its metadata, plus a `valid_answer` metric that is true for any rewarded
+    answer.
+    """
+    messages: art.Messages = [{"role": "user", "content": prompt}]
+    chat_completion = await create_chat_completion(
+        client,
+        model_name=model.get_inference_name(),
+        messages=messages,
+        max_tokens=max_tokens,
+        timeout=timeout,
+    )
+    choice = chat_completion.choices[0]
+    # Score once; the same value drives both the reward and the validity flag.
+    reward = reward_for_answer(choice.message.content)
+    return art.Trajectory(
+        messages_and_choices=[*messages, choice],
+        reward=reward,
+        metadata={
+            "scenario_id": scenario_id_for_prompt(prompt),
+            "actor_total_tokens": response_total_tokens(chat_completion),
+        },
+        metrics={"valid_answer": reward > 0.0},
+    )
+
+
+async def evaluate(
+    client: openai.AsyncOpenAI,
+    model: art.TrainableModel,
+    prompts: list[str],
+    *,
+    max_tokens: int,
+    timeout: float,
+) -> list[art.TrajectoryGroup]:
+    """Roll out each prompt once, yielding one single-trajectory group per prompt.
+
+    Each group is tagged with the prompt's scenario id in its metadata.
+    """
+    return await art.gather_trajectory_groups(
+        art.TrajectoryGroup(
+            [rollout(client, model, prompt, max_tokens=max_tokens, timeout=timeout)],
+            metadata={"scenario_id": scenario_id_for_prompt(prompt)},
+        )
+        for prompt in prompts
+    )
+
+
+def print_history_summary(model: art.TrainableModel) -> None:
+    """Print where this model's `history.jsonl` metrics file is located."""
+    model_dir = f"{model.base_path}/{model.project}/models/{model.name}"
+    print(f"History: {model_dir}/history.jsonl")
+
+
+def build_internal_config() -> art.dev.InternalModelConfig:
+    """Build the engine settings for the model from environment variables.
+
+    Reads GPU_MEMORY_UTILIZATION (default "0.85") and MAX_MODEL_LEN
+    (default "4096").
+    """
+    gpu_memory_utilization = float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.85"))
+    max_model_len = int(os.environ.get("MAX_MODEL_LEN", "4096"))
+    engine_args = art.dev.EngineArgs(
+        gpu_memory_utilization=gpu_memory_utilization,
+        max_model_len=max_model_len,
+    )
+    return art.dev.InternalModelConfig(engine_args=engine_args)
+
+
+async def main() -> None:
+    """Run the yes-no-maybe training loop with metrics instrumentation.
+
+    Every knob is read from an environment variable with a default
+    (BASE_MODEL, PROJECT, MODEL_NAME, EVAL_PROMPTS, NUM_STEPS,
+    ROLLOUTS_PER_PROMPT, MAX_TOKENS, TIMEOUT, EVAL_EVERY_N_STEPS,
+    LEARNING_RATE). The backend is closed on exit, including on error.
+    """
+    load_dotenv()
+
+    backend = LocalBackend()
+    base_model = os.environ.get("BASE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")
+    project = os.environ.get("PROJECT", "yes-no-maybe-metrics")
+    model = art.TrainableModel(
+        # The default name embeds the current Unix time in whole seconds.
+        name=os.environ.get("MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}"),
+        project=project,
+        base_model=base_model,
+        report_metrics=["wandb"],
+        _internal_config=build_internal_config(),
+    )
+    try:
+        await model.register(backend)
+
+        prompts = build_prompts()
+        # Evaluation uses a fixed prefix of the training prompts.
+        eval_prompts = prompts[: int(os.environ.get("EVAL_PROMPTS", "12"))]
+        openai_client = model.openai_client()
+        max_steps = int(os.environ.get("NUM_STEPS", "20"))
+        rollouts_per_prompt = int(os.environ.get("ROLLOUTS_PER_PROMPT", "32"))
+        max_tokens = int(os.environ.get("MAX_TOKENS", "100"))
+        timeout = float(os.environ.get("TIMEOUT", "100"))
+        eval_every_n_steps = int(os.environ.get("EVAL_EVERY_N_STEPS", "1"))
+        learning_rate = float(os.environ.get("LEARNING_RATE", "1e-4"))
+
+        start_step = await model.get_step()
+        for offset in range(max_steps):
+            # NOTE(review): this label assumes each train call advances the
+            # model step by exactly one - confirm against result.step.
+            current_step = start_step + offset
+
+            # (current_step - start_step) equals offset, so eval runs on the
+            # first iteration and then every eval_every_n_steps iterations;
+            # a value <= 0 disables eval entirely.
+            if (
+                eval_every_n_steps > 0
+                and (current_step - start_step) % eval_every_n_steps == 0
+            ):
+                eval_builder = model.metrics_builder("eval")
+                with eval_builder.activate_context():
+                    with eval_builder.measure("time/step_eval_s"):
+                        val_groups = await evaluate(
+                            openai_client,
+                            model,
+                            eval_prompts,
+                            max_tokens=max_tokens,
+                            timeout=timeout,
+                        )
+                    eval_builder.add_data(
+                        step_actor_tokens=total_actor_tokens(val_groups)
+                    )
+                await model.log(val_groups, split="val", step=current_step)
+
+            train_builder = model.metrics_builder("train")
+            with train_builder.activate_context():
+                # Only rollout generation is timed as actor time; training below
+                # runs inside the train context but outside this timer.
+                with train_builder.measure("time/step_actor_s"):
+                    train_groups = await art.gather_trajectory_groups(
+                        (
+                            art.TrajectoryGroup(
+                                rollout(
+                                    openai_client,
+                                    model,
+                                    prompt,
+                                    max_tokens=max_tokens,
+                                    timeout=timeout,
+                                )
+                                for _ in range(rollouts_per_prompt)
+                            )
+                            for prompt in prompts
+                        )
+                    )
+                train_builder.add_data(
+                    step_actor_tokens=total_actor_tokens(train_groups)
+                )
+                result = await backend.train(
+                    model,
+                    train_groups,
+                    learning_rate=learning_rate,
+                )
+
+            # Train metrics are logged under the step reported by the backend.
+            await model.log(
+                split="train",
+                step=result.step,
+                trajectories=train_groups,
+                metrics=result.metrics,
+            )
+            print(f"step {result.step} complete")
+
+        print_history_summary(model)
+    finally:
+        # Runs whether registration/training succeeded or raised.
+        await backend.close()
+
+
+# Script entry point: run the async training loop to completion.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/docs/docs.json b/docs/docs.json
index 99f5675c7..2b99e176e 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -67,6 +67,7 @@
           "features/checkpoint-forking",
           "features/checkpoint-deletion",
           "features/additional-histories",
+          "features/tracking-metrics",
           "features/mcp-rl"
         ]
       },
@@ -106,4 +107,4 @@
     "bluesky": "https://bsky.app/profile/openpipe.bsky.social",
     "github": "https://github.com/openpipe/ART"
   }
-}
\ No newline at end of file
+}
diff --git a/docs/features/tracking-metrics.mdx b/docs/features/tracking-metrics.mdx
new file mode 100644
index 000000000..3aea84e98
--- /dev/null
+++ b/docs/features/tracking-metrics.mdx
@@ -0,0 +1,167 @@
+---
+title: "Tracking Metrics"
+description: "See what ART logs automatically and how to add your own metrics and costs."
+sidebarTitle: "Tracking Metrics"
+icon: "chart-line"
+---
+
+ART writes a metrics row every time you call `model.log(...)`. Those rows go to
+`history.jsonl` in the run directory and, if W&B logging is enabled, to W&B.
+
+Use this page to:
+
+- understand the metrics ART emits automatically
+- add task-specific metrics from your own rollout code
+- track external judge and API spend alongside training metrics
+
+## What ART logs automatically
+
+When you call `await model.train(...)` or `await model.log(train_groups, split="train")`,
+ART already logs most of the metrics you need to monitor a run.
+
+| Type | Examples |
+| --- | --- |
+| Reward | `reward/mean`, `reward/std_dev`, `reward/exception_rate` |
+| Loss | `loss/train`, `loss/entropy`, `loss/kl_div`, `loss/grad_norm`, `loss/learning_rate` |
+| Data | `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`, `data/step_num_groups_trainable` |
+| Train summary | `train/num_groups_submitted`, `train/num_groups_trainable`, `train/num_trajectories` |
+| Time | `time/wall_clock_sec`, `time/step_wall_s`, `time/step_trainer_s` |
+| Cost | `costs/gpu` on `LocalBackend` when GPU pricing is known |
+
+If ART has the inputs it needs, it also derives:
+
+- cumulative metrics such as `time/cum/trainer_s`, `data/cum/num_unique_scenarios`, and `costs/cum/all`
+- cost rollups such as `costs/train`, `costs/eval`, and `costs/all`
+- throughput metrics such as `throughput/avg_trainer_tok_per_s` and `throughput/avg_actor_tok_per_s`
+
+<Note>
+  Some metrics only appear when the backend or your code provides the underlying
+  inputs. For example, `throughput/avg_actor_tok_per_s` requires both
+  `data/step_actor_tokens` and `time/step_actor_s`.
+</Note>
+
+## Add task-specific outcome metrics
+
+Attach metrics directly to each `Trajectory` when your rollout code knows whether
+an attempt succeeded, how many tools it called, or any other task-specific
+signal.
+
+```python
+async def rollout(model: art.Model, scenario: Scenario) -> art.Trajectory:
+    trajectory = art.Trajectory(
+        messages_and_choices=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": scenario.prompt},
+        ],
+        metadata={"scenario_id": scenario.id},
+        reward=0.0,  # placeholder until the rollout is scored below
+    )
+
+    completion = await model.openai_client().chat.completions.create(
+        model=model.get_inference_name(),
+        messages=trajectory.messages(),
+    )
+    trajectory.messages_and_choices.append(completion.choices[0])
+
+    trajectory.reward = score_reward(trajectory)
+    trajectory.metrics["correct"] = float(is_correct(trajectory))
+    trajectory.metrics["tool_calls"] = float(count_tool_calls(trajectory))
+    return trajectory
+```
+
+On train steps, ART averages those rollout metrics and logs them under the
+`reward/` namespace, such as `reward/correct` and `reward/tool_calls`.
+
+If you want to record one value per `TrajectoryGroup` instead of one per
+trajectory, pass `metrics={...}` when you build the group. ART logs those once
+per group, using keys like `reward/group_difficulty` on train steps.
+
+## Add step-level metrics ART cannot infer
+
+Use `model.metrics_builder()` for metrics that live outside individual
+trajectories, such as actor-side timing, token counts, or idle time.
+
+```python
+builder = model.metrics_builder()
+
+with builder.measure("time/step_actor_s"):
+    result = await run_rollouts()
+
+builder.add_data(
+    step_num_scenarios=result.num_scenarios,
+    step_actor_tokens=result.actor_tokens,
+    scenario_ids=result.scenario_ids,
+)
+builder.add_idle_times(step_actor_idle_s=result.actor_idle_s)
+
+await model.log(result.train_groups, split="train", step=result.step)
+```
+
+A few useful patterns:
+
+- log `scenario_ids` to unlock `data/cum/num_unique_scenarios`
+- log both `data/step_actor_tokens` and `time/step_actor_s` to unlock actor throughput metrics
+- log `time/step_eval_s` when eval runs happen outside the backend
+- use fully qualified keys like `time/step_actor_s` or `data/step_actor_tokens` for builder-managed metrics
+
+ART flushes builder-managed metrics on the next `model.log(...)` or
+`model.train(...)` call.
+
+## Track judge and API costs
+
+Use `@track_api_cost` when a function returns a provider response object with
+token usage. Wrap the relevant part of your code in a metrics context so ART
+knows whether the spend belongs to training or evaluation.
+
+```python
+from art.metrics import track_api_cost
+
+@track_api_cost(
+    source="llm_judge/correctness",
+    provider="openai",
+    model_name="openai/gpt-4.1",
+)
+async def run_judge(client, messages):
+    return await client.chat.completions.create(
+        model="gpt-4.1",
+        messages=messages,
+    )
+
+with model.metrics_builder("train").activate_context():
+    await run_judge(judge_client, train_messages)
+
+with model.metrics_builder("eval").activate_context():
+    await run_judge(judge_client, eval_messages)
+```
+
+The next metrics row will include:
+
+- `costs/train/llm_judge/correctness` or `costs/eval/llm_judge/correctness`
+- rollups such as `costs/train`, `costs/eval`, and `costs/all`
+- cumulative totals such as `costs/cum/all`
+
+ART can price OpenAI and Anthropic responses from their usage fields. You must
+pass both `provider` and `model_name` to `@track_api_cost`.
+
+For custom pricing or unsupported models, register pricing on the builder:
+
+```python
+builder = model.metrics_builder()
+builder.register_model_pricing(
+    "anthropic/my-custom-judge",
+    prompt_per_million=1.2,
+    completion_per_million=4.8,
+)
+```
+
+## Track GPU cost on LocalBackend
+
+`LocalBackend` can log `costs/gpu` automatically on train steps. ART currently
+auto-detects H200 pricing at `$3/hour` per GPU. For other hardware, pass an
+explicit override:
+
+```python
+backend = LocalBackend(gpu_cost_per_hour_usd=2.25)
+```
+
+This lets ART include GPU spend in the same metrics stream as rewards, losses,
+and judge/API costs.
diff --git a/docs/getting-started/quick-start.mdx b/docs/getting-started/quick-start.mdx
index 58eb0ccf0..63a38e02c 100644
--- a/docs/getting-started/quick-start.mdx
+++ b/docs/getting-started/quick-start.mdx
@@ -38,4 +38,4 @@ At the top of the [notebook](https://colab.research.google.com/github/openpipe/a
 
 ## Step 3: Track metrics
 
-While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training!
+While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. For a guide to the metrics ART logs automatically and how to add your own, see [Tracking Metrics](/features/tracking-metrics). If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training!
diff --git a/pyproject.toml b/pyproject.toml
index f5563f66e..1941244ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,6 +116,9 @@ force-sort-within-sections = true
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
+markers = [
+    "live_api_cost: opt-in live API cost validation against provider endpoints",
+]
 
 [tool.uv]
 required-version = ">=0.6.15"
diff --git a/src/art/api_costs.py b/src/art/api_costs.py
new file mode 100644
index 000000000..1bbd9ed95
--- /dev/null
+++ b/src/art/api_costs.py
@@ -0,0 +1,502 @@
+from __future__ import annotations
+
+from collections.abc import Awaitable, Callable, Mapping
+from dataclasses import dataclass
+from functools import wraps
+from inspect import iscoroutinefunction
+from typing import Any, ParamSpec, TypeVar, cast
+
+from .costs import get_model_pricing, tokens_to_cost
+
+OPENAI_PROVIDER = "openai"
+ANTHROPIC_PROVIDER = "anthropic"
+
+P = ParamSpec("P")
+R = TypeVar("R")
+
+CostExtractor = Callable[[Any], float | None]
+ResponseGetter = Callable[[Any], Any]
+
+
+@dataclass(frozen=True)
+class TokenPricing:
+    """Per-model token prices; each field is a price per one million tokens.
+
+    The optional cache prices fall back to `prompt_per_million` when left as
+    None (see the provider cost estimators in this module).
+    """
+
+    prompt_per_million: float
+    completion_per_million: float
+    # Price for the cached portion of the prompt in OpenAI-style usage.
+    cached_prompt_per_million: float | None = None
+    # Prices for Anthropic-style cache-creation and cache-read input tokens.
+    cache_creation_per_million: float | None = None
+    cache_read_per_million: float | None = None
+
+
+@dataclass(frozen=True)
+class _OpenAITokenUsage:
+    """Token counts extracted from an OpenAI-style `usage` payload."""
+
+    # Total prompt tokens; the cost estimator subtracts the cached part from it.
+    prompt_tokens: float
+    completion_tokens: float
+    # Cached part of the prompt (the extractor caps it at prompt_tokens).
+    cached_prompt_tokens: float
+
+
+@dataclass(frozen=True)
+class _AnthropicTokenUsage:
+    """Token counts extracted from an Anthropic-style `usage` payload.
+
+    The cost estimator prices each of the four counts separately and adds
+    them up.
+    """
+
+    input_tokens: float
+    output_tokens: float
+    cache_creation_input_tokens: float
+    cache_read_input_tokens: float
+
+
+# Built-in token pricing keyed by "<provider>/<model>". These entries are
+# consulted before the art.costs pricing table (see _configured_token_pricing).
+MODEL_TOKEN_PRICING: dict[str, TokenPricing] = {
+    "openai/gpt-4.1": TokenPricing(
+        prompt_per_million=2.0,
+        completion_per_million=8.0,
+        cached_prompt_per_million=0.5,
+    ),
+    "anthropic/claude-sonnet-4-6": TokenPricing(
+        prompt_per_million=3.0,
+        completion_per_million=15.0,
+        cache_creation_per_million=3.75,
+        cache_read_per_million=0.30,
+    ),
+}
+
+
+def _configured_token_pricing(model_name: str) -> TokenPricing | None:
+    """Look up built-in pricing for `model_name`, or None if there is none.
+
+    `MODEL_TOKEN_PRICING` wins; otherwise the `art.costs` pricing (if any) is
+    adapted, mapping its prefill price to the prompt price and its sample
+    price to the completion price.
+    """
+    if (explicit := MODEL_TOKEN_PRICING.get(model_name)) is not None:
+        return explicit
+
+    fallback = get_model_pricing(model_name)
+    if fallback is None:
+        return None
+    prompt_price, completion_price = fallback.prefill, fallback.sample
+    return TokenPricing(
+        prompt_per_million=prompt_price,
+        completion_per_million=completion_price,
+    )
+
+
+def normalize_provider(provider: str | None) -> str | None:
+    """Return `provider` stripped and lower-cased, or None if missing/blank."""
+    normalized = provider.strip().lower() if provider is not None else ""
+    return normalized or None
+
+
+def _read_usage_field(usage: Any, field: str) -> float | None:
+    """Read `field` from a usage dict or object as a float; None if absent."""
+    if usage is None:
+        return None
+    if isinstance(usage, dict):
+        raw = usage.get(field)
+    else:
+        raw = getattr(usage, field, None)
+    return None if raw is None else float(raw)
+
+
+def _read_usage_nested_field(usage: Any, *fields: str) -> float | None:
+    """Walk `fields` through nested dicts/objects and return the leaf as a float.
+
+    Returns None as soon as any level (including `usage` itself) is missing.
+    """
+    node = usage
+    for name in fields:
+        if node is None:
+            return None
+        node = node.get(name) if isinstance(node, dict) else getattr(node, name, None)
+    return float(node) if node is not None else None
+
+
+def _response_usage(response: Any) -> Any:
+    """Return the `usage` entry of a response dict or object, or None if absent."""
+    return (
+        response.get("usage")
+        if isinstance(response, dict)
+        else getattr(response, "usage", None)
+    )
+
+
+def _extract_openai_token_counts(response: Any) -> _OpenAITokenUsage | None:
+    """Pull prompt/completion/cached token counts out of an OpenAI-style response.
+
+    Returns None when the response reports none of these counts. Missing
+    counts default to 0, and cached tokens are capped at the prompt total.
+    """
+    usage = _response_usage(response)
+    prompt = _read_usage_field(usage, "prompt_tokens")
+    completion = _read_usage_field(usage, "completion_tokens")
+    cached = (
+        _read_usage_nested_field(usage, "prompt_tokens_details", "cached_tokens") or 0.0
+    )
+
+    nothing_reported = prompt is None and completion is None and cached == 0.0
+    if nothing_reported:
+        return None
+
+    prompt_total = prompt or 0.0
+    return _OpenAITokenUsage(
+        prompt_tokens=prompt_total,
+        completion_tokens=completion or 0.0,
+        cached_prompt_tokens=min(cached, prompt_total),
+    )
+
+
+def _extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | None:
+    """Pull input/output/cache token counts out of an Anthropic-style response.
+
+    Returns None when the response reports none of these counts; otherwise
+    missing counts default to 0.
+    """
+    usage = _response_usage(response)
+    input_count = _read_usage_field(usage, "input_tokens")
+    output_count = _read_usage_field(usage, "output_tokens")
+    cache_creation = _read_usage_field(usage, "cache_creation_input_tokens") or 0.0
+    cache_read = _read_usage_field(usage, "cache_read_input_tokens") or 0.0
+
+    nothing_reported = (
+        input_count is None
+        and output_count is None
+        and cache_creation == 0.0
+        and cache_read == 0.0
+    )
+    if nothing_reported:
+        return None
+
+    return _AnthropicTokenUsage(
+        input_tokens=input_count or 0.0,
+        output_tokens=output_count or 0.0,
+        cache_creation_input_tokens=cache_creation,
+        cache_read_input_tokens=cache_read,
+    )
+
+
+def _estimate_openai_cost(
+    token_counts: _OpenAITokenUsage | None,
+    pricing: TokenPricing,
+) -> float | None:
+    """Price OpenAI-style usage, or return None when there are no counts.
+
+    Cached prompt tokens are billed at the cached-prompt price (falling back
+    to the regular prompt price when unset); the rest of the prompt is billed
+    at the regular prompt price.
+    """
+    if token_counts is None:
+        return None
+
+    cached_tokens = token_counts.cached_prompt_tokens
+    uncached_tokens = max(token_counts.prompt_tokens - cached_tokens, 0.0)
+
+    cached_price = pricing.cached_prompt_per_million
+    if cached_price is None:
+        cached_price = pricing.prompt_per_million
+
+    prompt_cost = tokens_to_cost(uncached_tokens, pricing.prompt_per_million)
+    cached_cost = tokens_to_cost(cached_tokens, cached_price)
+    completion_cost = tokens_to_cost(
+        token_counts.completion_tokens,
+        pricing.completion_per_million,
+    )
+    return prompt_cost + cached_cost + completion_cost
+
+
+def _estimate_anthropic_cost(
+    token_counts: _AnthropicTokenUsage | None,
+    pricing: TokenPricing,
+) -> float | None:
+    """Price Anthropic-style usage, or return None when there are no counts.
+
+    Cache-creation and cache-read tokens use their own prices, each falling
+    back to the regular prompt price when unset.
+    """
+    if token_counts is None:
+        return None
+
+    creation_price = pricing.cache_creation_per_million
+    if creation_price is None:
+        creation_price = pricing.prompt_per_million
+    read_price = pricing.cache_read_per_million
+    if read_price is None:
+        read_price = pricing.prompt_per_million
+
+    input_cost = tokens_to_cost(token_counts.input_tokens, pricing.prompt_per_million)
+    creation_cost = tokens_to_cost(
+        token_counts.cache_creation_input_tokens,
+        creation_price,
+    )
+    read_cost = tokens_to_cost(token_counts.cache_read_input_tokens, read_price)
+    output_cost = tokens_to_cost(
+        token_counts.output_tokens,
+        pricing.completion_per_million,
+    )
+    return input_cost + creation_cost + read_cost + output_cost
+
+
+def _estimate_provider_cost(
+    provider_name: str,
+    response: Any,
+    pricing: TokenPricing,
+) -> float | None:
+    """Estimate a response's cost with the built-in estimator for the provider.
+
+    Returns None for providers other than OpenAI and Anthropic, or when the
+    response carries no usable token counts.
+    """
+    if provider_name == OPENAI_PROVIDER:
+        openai_counts = _extract_openai_token_counts(response)
+        return _estimate_openai_cost(openai_counts, pricing)
+    if provider_name == ANTHROPIC_PROVIDER:
+        anthropic_counts = _extract_anthropic_token_counts(response)
+        return _estimate_anthropic_cost(anthropic_counts, pricing)
+    return None
+
+
+def _resolve_registered_or_default_pricing(
+    model_name: str,
+    *,
+    model_pricing: Mapping[str, TokenPricing],
+) -> TokenPricing | None:
+    """Prefer pricing registered in `model_pricing`; else use the built-in tables."""
+    if (registered := model_pricing.get(model_name)) is not None:
+        return registered
+    return _configured_token_pricing(model_name)
+
+
+def _merge_token_pricing(
+    *,
+    base_pricing: TokenPricing,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
+) -> TokenPricing:
+    """Overlay explicit per-call prices on top of `base_pricing`.
+
+    Each explicit price that is not None is coerced to float and replaces the
+    matching base field; None keeps the base value.
+    """
+
+    def _pick(explicit: float | None, default: Any) -> Any:
+        return default if explicit is None else float(explicit)
+
+    base = base_pricing
+    return TokenPricing(
+        prompt_per_million=_pick(prompt_price_per_million, base.prompt_per_million),
+        completion_per_million=_pick(
+            completion_price_per_million, base.completion_per_million
+        ),
+        cached_prompt_per_million=_pick(
+            cached_prompt_price_per_million, base.cached_prompt_per_million
+        ),
+        cache_creation_per_million=_pick(
+            cache_creation_price_per_million, base.cache_creation_per_million
+        ),
+        cache_read_per_million=_pick(
+            cache_read_price_per_million, base.cache_read_per_million
+        ),
+    )
+
+
+def normalize_model_name(model_name: str | None) -> str | None:
+    """Return `model_name` stripped of whitespace, or None if missing/blank."""
+    stripped = model_name.strip() if model_name is not None else ""
+    return stripped or None
+
+
+def _resolve_token_pricing(
+    *,
+    provider: str,
+    model_name: str,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
+    model_pricing: Mapping[str, TokenPricing],
+) -> TokenPricing:
+    """Resolve the effective pricing for one API call.
+
+    Starts from the pricing registered in `model_pricing` (or, failing that,
+    the built-in tables) and overlays any explicit per-call prices.
+
+    Raises:
+        ValueError: if `provider` or `model_name` is blank, or no pricing is
+            configured for the model.
+    """
+    # Validate identifiers first so bad input fails before any lookup work.
+    if normalize_provider(provider) is None:
+        raise ValueError("provider must be non-empty")
+
+    normalized_model_name = normalize_model_name(model_name)
+    if normalized_model_name is None:
+        raise ValueError("model_name must be non-empty")
+
+    configured_pricing = _resolve_registered_or_default_pricing(
+        normalized_model_name,
+        model_pricing=model_pricing,
+    )
+    if configured_pricing is None:
+        raise ValueError(
+            f"No pricing configured for model '{normalized_model_name}'. "
+            "Add it to art.api_costs.MODEL_TOKEN_PRICING, art.costs.MODEL_PRICING, "
+            "or register it with MetricsBuilder.register_model_pricing()."
+        )
+
+    # _merge_token_pricing coerces explicit prices to float itself, so they
+    # are passed through untouched rather than converted twice.
+    return _merge_token_pricing(
+        base_pricing=configured_pricing,
+        prompt_price_per_million=prompt_price_per_million,
+        completion_price_per_million=completion_price_per_million,
+        cached_prompt_price_per_million=cached_prompt_price_per_million,
+        cache_creation_price_per_million=cache_creation_price_per_million,
+        cache_read_price_per_million=cache_read_price_per_million,
+    )
+
+
+def extract_api_cost(
+    response: Any,
+    *,
+    provider: str,
+    model_name: str,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
+    cost_extractors: Mapping[str, CostExtractor],
+    model_pricing: Mapping[str, TokenPricing],
+) -> float | None:
+    """Compute the cost of one provider response.
+
+    Resolution order:
+      1. A custom extractor registered for the provider in `cost_extractors`,
+         when it returns a non-None cost.
+      2. The built-in OpenAI/Anthropic estimator, using pricing resolved from
+         `model_pricing`, the built-in tables, and the explicit price
+         overrides.
+
+    As written, every path either returns a float or raises; None is never
+    returned despite the annotation.
+
+    Raises:
+        ValueError: if `provider` is blank, pricing cannot be resolved, the
+            response yields no cost for a built-in provider, or the provider
+            has no usable extractor.
+    """
+    provider_name = normalize_provider(provider)
+    if provider_name is None:
+        raise ValueError("provider must be non-empty")
+
+    # A custom extractor that returns None falls through to built-in pricing.
+    custom_extractor = cost_extractors.get(provider_name)
+    if custom_extractor is not None:
+        custom_cost = custom_extractor(response)
+        if custom_cost is not None:
+            return float(custom_cost)
+
+    # Pricing is resolved (and may raise) before provider support is checked.
+    pricing = _resolve_token_pricing(
+        provider=provider_name,
+        model_name=model_name,
+        prompt_price_per_million=prompt_price_per_million,
+        completion_price_per_million=completion_price_per_million,
+        cached_prompt_price_per_million=cached_prompt_price_per_million,
+        cache_creation_price_per_million=cache_creation_price_per_million,
+        cache_read_price_per_million=cache_read_price_per_million,
+        model_pricing=model_pricing,
+    )
+    provider_cost = _estimate_provider_cost(provider_name, response, pricing)
+    if provider_cost is not None:
+        return provider_cost
+
+    # Built-in provider, but its expected usage fields were not found.
+    if provider_name in {OPENAI_PROVIDER, ANTHROPIC_PROVIDER}:
+        raise ValueError(
+            f"Response usage does not match provider '{provider_name}'. "
+            "Pass the correct provider/model pair or register a custom cost extractor."
+        )
+    raise ValueError(f"No cost extractor registered for provider '{provider_name}'.")
+
+
+def _record_api_cost(
+    *,
+    result: Any,
+    source: str,
+    provider: str,
+    response_getter: ResponseGetter | None,
+    model_name: str,
+    prompt_price_per_million: float | None,
+    completion_price_per_million: float | None,
+    cached_prompt_price_per_million: float | None,
+    cache_creation_price_per_million: float | None,
+    cache_read_price_per_million: float | None,
+) -> None:
+    """Record the cost of `result` on the active MetricsBuilder, if there is one.
+
+    `response_getter`, when given, maps `result` to the provider response;
+    otherwise `result` itself is treated as the response. If looking up the
+    active builder raises LookupError, nothing is recorded.
+    """
+    try:
+        # NOTE(review): imported lazily, presumably to avoid a circular import
+        # between art.metrics and this module - confirm.
+        from .metrics import MetricsBuilder
+
+        builder = MetricsBuilder.get_active()
+    except LookupError:
+        # Presumably no metrics context is active; cost tracking is skipped.
+        return
+
+    response = response_getter(result) if response_getter is not None else result
+    builder.add_response_cost(
+        source,
+        response,
+        provider=provider,
+        model_name=model_name,
+        prompt_price_per_million=prompt_price_per_million,
+        completion_price_per_million=completion_price_per_million,
+        cached_prompt_price_per_million=cached_prompt_price_per_million,
+        cache_creation_price_per_million=cache_creation_price_per_million,
+        cache_read_price_per_million=cache_read_price_per_million,
+    )
+
+
+def track_api_cost(
+    *,
+    source: str,
+    provider: str,
+    model_name: str,
+    response_getter: ResponseGetter | None = None,
+    prompt_price_per_million: float | None = None,
+    completion_price_per_million: float | None = None,
+    cached_prompt_price_per_million: float | None = None,
+    cache_creation_price_per_million: float | None = None,
+    cache_read_price_per_million: float | None = None,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+    """Build a decorator that records the API cost of each call it wraps.
+
+    Works on both plain functions and `async def` functions. The wrapped
+    function's result is returned unchanged after `_record_api_cost` has been
+    called with it.
+
+    Args:
+        source: Cost source label; leading/trailing "/" are stripped.
+        provider: Provider name; stripped and lower-cased.
+        model_name: Model name used for the pricing lookup; stripped.
+        response_getter: Optional callable mapping the wrapped function's
+            result to the provider response. Defaults to the result itself.
+        *_price_per_million: Optional per-call price overrides.
+
+    Raises:
+        ValueError: at decoration time, if `source`, `provider`, or
+            `model_name` is empty after normalization.
+    """
+    normalized_source = source.strip("/")
+    if not normalized_source:
+        raise ValueError("source must be non-empty")
+
+    normalized_provider = normalize_provider(provider)
+    if normalized_provider is None:
+        raise ValueError("provider must be non-empty")
+    normalized_model_name = normalize_model_name(model_name)
+    if normalized_model_name is None:
+        raise ValueError("model_name must be non-empty")
+
+    def _decorate(func: Callable[P, R]) -> Callable[P, R]:
+        # Coroutine functions get an async wrapper so the cost is recorded
+        # from the awaited result rather than from the coroutine object.
+        if iscoroutinefunction(func):
+            async_func = cast(Callable[P, Awaitable[Any]], func)
+
+            @wraps(func)
+            async def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
+                result = await async_func(*args, **kwargs)
+                _record_api_cost(
+                    result=result,
+                    source=normalized_source,
+                    provider=normalized_provider,
+                    response_getter=response_getter,
+                    model_name=normalized_model_name,
+                    prompt_price_per_million=prompt_price_per_million,
+                    completion_price_per_million=completion_price_per_million,
+                    cached_prompt_price_per_million=cached_prompt_price_per_million,
+                    cache_creation_price_per_million=cache_creation_price_per_million,
+                    cache_read_price_per_million=cache_read_price_per_million,
+                )
+                return result
+
+            return cast(Callable[P, R], _async_wrapper)
+
+        # Synchronous path: same recording, no await.
+        @wraps(func)
+        def _sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+            result = func(*args, **kwargs)
+            _record_api_cost(
+                result=result,
+                source=normalized_source,
+                provider=normalized_provider,
+                response_getter=response_getter,
+                model_name=normalized_model_name,
+                prompt_price_per_million=prompt_price_per_million,
+                completion_price_per_million=completion_price_per_million,
+                cached_prompt_price_per_million=cached_prompt_price_per_million,
+                cache_creation_price_per_million=cache_creation_price_per_million,
+                cache_read_price_per_million=cache_read_price_per_million,
+            )
+            return result
+
+        return _sync_wrapper
+
+    return _decorate
diff --git a/src/art/costs.py b/src/art/costs.py
index 5ee5523a9..e3e2b2b47 100644
--- a/src/art/costs.py
+++ b/src/art/costs.py
@@ -16,7 +16,7 @@ class ModelPricing:
 
 
 TokenCount: TypeAlias = int | None
-CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount], dict[str, float]]
+CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount, str], dict[str, float]]
 
 # Pricing per model ($/1M tokens). Keep in sync with infra pricing.
 MODEL_PRICING: dict[str, ModelPricing] = {
@@ -88,16 +88,20 @@ def compute_sample_costs(
     *,
     prompt_tokens: int | None,
     completion_tokens: int | None,
+    cost_context: str,
     pricing: ModelPricing,
 ) -> dict[str, float]:
     """Compute prompt+completion costs for a single API call."""
+    normalized_context = cost_context.strip("/")
+    if not normalized_context:
+        raise ValueError("cost_context must be non-empty")
     prompt_value = float(prompt_tokens or 0)
     completion_value = float(completion_tokens or 0)
     prefill_cost = tokens_to_cost(prompt_value, pricing.prefill)
     sample_cost = tokens_to_cost(completion_value, pricing.sample)
     return {
-        "costs_prefill": prefill_cost,
-        "costs_sample": sample_cost,
+        f"costs/{normalized_context}/prefill": prefill_cost,
+        f"costs/{normalized_context}/sample": sample_cost,
     }
 
 
@@ -105,11 +109,14 @@ def build_cost_calculator(pricing: ModelPricing) -> CostCalculator:
     """Return a callable that computes prompt+completion costs for a request."""
 
     def _calculator(
-        prompt_tokens: int | None, completion_tokens: int | None
+        prompt_tokens: int | None,
+        completion_tokens: int | None,
+        cost_context: str,
     ) -> dict[str, float]:
         return compute_sample_costs(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
+            cost_context=cost_context,
             pricing=pricing,
         )
 
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index 876768938..c8b0570a2 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -6,12 +6,17 @@
 import shutil
 import socket
 import subprocess
+import time
 from types import TracebackType
 from typing import AsyncIterator, Iterable, Literal, cast
 import warnings
 
 logger = logging.getLogger(__name__)
 
+_AUTO_GPU_HOURLY_PRICING_USD = {  # GPU-name substring -> USD per GPU-hour
+    "H200": 3.0,  # matched against the upper-cased CUDA device name
+}
+
 import aiohttp
 import numpy as np
 from openai import AsyncOpenAI
@@ -39,6 +44,12 @@
 
 from .. import dev
 from ..backend import AnyTrainableModel, Backend
+from ..metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    average_metric_samples,
+    build_training_summary_metrics,
+    summarize_trajectory_groups,
+)
 from ..model import Model, TrainableModel
 from ..preprocessing.pack import (
     PackedTensors,
@@ -60,7 +71,13 @@
 
 
 class LocalBackend(Backend):
-    def __init__(self, *, in_process: bool = False, path: str | None = None) -> None:
+    def __init__(
+        self,
+        *,
+        in_process: bool = False,
+        path: str | None = None,
+        gpu_cost_per_hour_usd: float | None = None,
+    ) -> None:
         """
         Initializes a local, directory-based Backend interface at the given path.
 
@@ -71,9 +88,16 @@ def __init__(self, *, in_process: bool = False, path: str | None = None) -> None
         Args:
             in_process: Whether to run the local service in-process.
             path: The path to the local directory. Defaults to "{repo_root}/.art".
+            gpu_cost_per_hour_usd: Optional per-GPU hourly price override used for
+                automatic `costs/gpu` accounting on train steps. When unset,
+                ART auto-detects supported GPU types (H200 at $3/hr today) and
+                skips GPU cost logging for unknown devices instead of guessing.
         """
         self._in_process = in_process
         self._path = path or get_default_art_path()
+        self._gpu_cost_per_hour_usd = (
+            float(gpu_cost_per_hour_usd) if gpu_cost_per_hour_usd is not None else None
+        )
         os.makedirs(self._path, exist_ok=True)
 
         # Other initialization
@@ -81,6 +105,57 @@ def __init__(self, *, in_process: bool = False, path: str | None = None) -> None
         self._tokenizers: dict[str, PreTrainedTokenizerBase] = {}
         self._image_processors: dict[str, BaseImageProcessor | None] = {}
 
+    def supports_automatic_train_step_metrics(self) -> bool:
+        return True  # This backend always reports support.
+
+    def automatic_gpu_cost_per_hour_usd(self, model: Model) -> float | None:
+        """Return the combined hourly USD price of the GPUs allocated to `model`."""
+        hourly_rate = self._resolve_gpu_cost_per_hour_usd()
+        if hourly_rate is None:
+            # No per-GPU price could be resolved, so no cost is reported.
+            return None
+        num_gpus = self._allocated_gpu_count(model)
+        # A non-positive GPU count likewise yields no cost.
+        return hourly_rate * num_gpus if num_gpus > 0 else None
+
+    def _resolve_gpu_cost_per_hour_usd(self) -> float | None:
+        """Resolve the per-GPU hourly USD price, or None when it cannot be determined.
+
+        The constructor override wins; otherwise every visible CUDA device must
+        match `_AUTO_GPU_HOURLY_PRICING_USD` and all matches must share one price.
+        """
+        override = self._gpu_cost_per_hour_usd
+        if override is not None:
+            return override
+        if not torch.cuda.is_available():
+            return None
+
+        distinct_prices: set[float] = set()
+        for device_index in range(torch.cuda.device_count()):
+            upper_name = torch.cuda.get_device_name(device_index).upper()
+            matched_prices = [
+                price
+                for name, price in _AUTO_GPU_HOURLY_PRICING_USD.items()
+                if name in upper_name
+            ]
+            if not matched_prices:
+                return None  # Unrecognized device: no price is reported.
+            distinct_prices.add(matched_prices[0])
+        # Zero devices or mixed prices both yield None.
+        return distinct_prices.pop() if len(distinct_prices) == 1 else None
+
+    def _allocated_gpu_count(self, model: Model) -> int:
+        """Count the GPUs configured for `model`, else every visible CUDA device."""
+        if isinstance(model, TrainableModel) and model._internal_config is not None:
+            config = model._internal_config
+            gpu_ids = {
+                *config.get("trainer_gpu_ids", []),
+                *config.get("inference_gpu_ids", []),
+            }
+            if gpu_ids:
+                return len(gpu_ids)
+        return torch.cuda.device_count() if torch.cuda.is_available() else 0
+
     def __enter__(self) -> Self:
         return self
 
@@ -565,20 +640,28 @@ async def train(  # type: ignore[override]
 
         # Collect metrics from training
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self._train_model(
             model, groups_list, config, dev_config, verbose
         ):
             training_metrics.append(metrics)
 
         # Aggregate metrics
-        avg_metrics: dict[str, float] = {}
-        if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-                if k != "num_gradient_steps"
+        avg_metrics = average_metric_samples(training_metrics)
+        summary = summarize_trajectory_groups(groups_list)
+        avg_metrics.setdefault(
+            "time/step_trainer_s", time.monotonic() - trainer_started
+        )
+        avg_metrics.update(
+            {
+                key: value
+                for key, value in build_training_summary_metrics(
+                    summary,
+                    include_trainable_groups=True,
+                ).items()
+                if key not in avg_metrics
             }
+        )
 
         # Get step and checkpoint path
         step = await self._get_step(model)
@@ -616,12 +699,10 @@ async def _train_model(
         if verbose:
             print("Packing tensors...")
 
-        # Count submitted groups and trainable groups
-        num_groups_submitted = len(trajectory_groups)
-        num_groups_trainable = sum(
-            1
-            for group in trajectory_groups
-            if group and len(set(trajectory.reward for trajectory in group)) > 1
+        summary = summarize_trajectory_groups(trajectory_groups)
+        base_metrics = build_training_summary_metrics(
+            summary,
+            include_trainable_groups=True,
         )
 
         packed_tensors = self._get_packed_tensors(
@@ -684,29 +765,36 @@ async def _train_model(
             # Yield metrics showing no groups were trainable
             # (the frontend will handle logging)
             yield {
-                "num_groups_submitted": num_groups_submitted,
-                "num_groups_trainable": 0,
-                "num_gradient_steps": 0,
+                **base_metrics,
+                "data/step_num_groups_trainable": 0.0,
+                "train/num_groups_trainable": 0.0,
+                "data/step_trainer_tokens": 0.0,
+                TRAIN_GRADIENT_STEPS_KEY: 0.0,
             }
             return
+        base_metrics["data/step_trainer_tokens"] = float(
+            packed_tensors["assistant_mask"].sum().item()
+        )
         disk_packed_tensors = packed_tensors_to_dir(
             packed_tensors, f"{get_model_dir(model=model, art_path=self._path)}/tensors"
         )
         # Note: scale_learning_rate_by_reward_std_dev is now handled by the frontend (Model.train())
-        results: list[dict[str, float]] = []
         estimated_gradient_steps = disk_packed_tensors["num_sequences"]
         pbar = tqdm.tqdm(total=estimated_gradient_steps, desc="train")
         async for result in service.train(
             disk_packed_tensors, config, dev_config, verbose
         ):
             num_gradient_steps = int(
-                result.pop("num_gradient_steps", estimated_gradient_steps)
+                result.pop(TRAIN_GRADIENT_STEPS_KEY, estimated_gradient_steps)
             )
             assert num_gradient_steps == estimated_gradient_steps, (
                 f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
             )
-            results.append(result)
-            yield {**result, "num_gradient_steps": num_gradient_steps}
+            yield {
+                **base_metrics,
+                **result,
+                TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps),
+            }
             pbar.update(1)
             pbar.set_postfix(result)
         pbar.close()
@@ -793,15 +881,22 @@ async def _train_sft(
         service = await self._get_service(model)
 
         pbar = tqdm.tqdm(total=len(batches), desc="sft train")
-        total_trainable_tokens = 0
+        total_trainable_tokens = sum(batch.num_trainable_tokens for batch in batches)
+        total_trajectories = len(trajectory_list)
         batch_count = 0
 
         async for result in service.train_sft(batches, verbose):
             pbar.update(1)
-            pbar.set_postfix({"loss": f"{result.get('loss', 0):.4f}"})
-            total_trainable_tokens += result.get("num_trainable_tokens", 0)
+            pbar.set_postfix({"loss": f"{result.get('loss/train', 0):.4f}"})
             batch_count += 1
-            yield result
+            yield {
+                **result,
+                "data/step_num_trajectories": float(total_trajectories),
+                "data/step_trainer_tokens": float(total_trainable_tokens),
+                TRAIN_GRADIENT_STEPS_KEY: float(len(batches)),
+                "train/num_trajectories": float(total_trajectories),
+                "train/num_trainable_tokens": float(total_trainable_tokens),
+            }
 
         pbar.close()
 
diff --git a/src/art/megatron/train.py b/src/art/megatron/train.py
index 85c36d1fa..02e3b7cd9 100644
--- a/src/art/megatron/train.py
+++ b/src/art/megatron/train.py
@@ -282,8 +282,8 @@ def print0(*values: Any) -> None:
             with open("/tmp/megatron_training_log.jsonl", "a+") as log_file:
                 log_msg = json.dumps(
                     {
-                        "loss": loss.item(),
-                        "grad_norm": grad_norm,
+                        "loss/train": loss.item(),
+                        "loss/grad_norm": grad_norm,
                         "probs_corr": probs_corr,
                     }
                 )
diff --git a/src/art/metrics.py b/src/art/metrics.py
new file mode 100644
index 000000000..eda9ab9ca
--- /dev/null
+++ b/src/art/metrics.py
@@ -0,0 +1,380 @@
+from __future__ import annotations
+
+import asyncio
+from contextlib import contextmanager
+from contextvars import ContextVar, Token
+from dataclasses import dataclass
+import time
+from typing import Any
+
+from .api_costs import (
+    CostExtractor,
+    TokenPricing,
+    extract_api_cost,
+    normalize_model_name,
+    normalize_provider,
+)
+
+_active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder")
+# Sections whose step metrics `MetricsBuilder.flush` also accumulates under `<section>/cum/`.
+_HIERARCHICAL_SECTIONS = {"costs", "time", "data"}
+_THROUGHPUT_IDLE_MAPPINGS = {  # step idle-time key -> cumulative key it adds into
+    "throughput/step_trainer_idle_s": "throughput/cum/trainer_idle_s",
+    "throughput/step_actor_idle_s": "throughput/cum/actor_idle_s",
+}
+
+
+def is_cumulative_metric_key(key: str) -> bool:
+    """Return True when the second `/`-separated segment of `key` is exactly `cum`."""
+    return key.split("/", 2)[1:2] == ["cum"]
+
+
+def is_builder_managed_metric(key: str) -> bool:  # True when `key` has a prefix below.
+    return key.startswith(("costs/", "time/step_", "data/step_", "throughput/step_"))
+
+
+def to_cumulative_metric_key(key: str) -> str:
+    """Map `section/[step_]name` to `section/cum/name`; raise ValueError on bad keys."""
+    if is_cumulative_metric_key(key):
+        raise ValueError(f"Metric key '{key}' is already cumulative.")
+    section, separator, rest = key.partition("/")
+    if not separator:
+        raise ValueError(f"Metric key '{key}' must include a section prefix.")
+    return f"{section}/cum/{rest.removeprefix('step_')}"
+
+
+@dataclass
+class _SharedMetricsState:
+    lock: asyncio.Lock  # held by `MetricsBuilder.flush` while it snapshots and resets
+    step_buffer: dict[str, float]  # metric sums for the current step; cleared by flush
+    cum_state: dict[str, float]  # running totals keyed by cumulative metric key
+    unique_scenario_ids: set[str]  # every scenario id merged in by a flush so far
+    pending_scenario_ids: set[str]  # ids added since the last flush
+    cost_extractors: dict[str, CostExtractor]  # keyed by normalized provider name
+    model_pricing: dict[str, TokenPricing]  # keyed by normalized model name
+
+
+def _new_shared_metrics_state() -> _SharedMetricsState:  # Fresh lock, empty containers.
+    return _SharedMetricsState(
+        lock=asyncio.Lock(),
+        step_buffer={},
+        cum_state={},
+        unique_scenario_ids=set(),
+        pending_scenario_ids=set(),
+        cost_extractors={},
+        model_pricing={},
+    )
+
+
+class MetricsBuilder:
+    """Build and accumulate step-level metrics for logging."""
+
+    def __init__(
+        self,
+        cost_context: str,
+        *,
+        _shared_state: _SharedMetricsState | None = None,
+    ) -> None:
+        # Normalize exactly like `for_cost_context` so both paths build the same keys.
+        self.cost_context = (cost_context or "").strip().strip("/")
+        if not self.cost_context:
+            raise ValueError("cost_context must be non-empty")
+        self._shared_state = (
+            _shared_state if _shared_state is not None else _new_shared_metrics_state()
+        )
+
+    def add_cost(self, path: str, usd: float) -> None:
+        """Add `usd` to step metric `costs/<path>`; reject an empty path."""
+        if not path:
+            raise ValueError("Cost path must be non-empty")
+        self.add_metric(f"costs/{path}", float(usd))
+
+    def add_response_cost(
+        self,
+        source: str,
+        response: Any,
+        *,
+        provider: str,
+        model_name: str,
+        prompt_price_per_million: float | None = None,
+        completion_price_per_million: float | None = None,
+        cached_prompt_price_per_million: float | None = None,
+        cache_creation_price_per_million: float | None = None,
+        cache_read_price_per_million: float | None = None,
+    ) -> float | None:
+        """Extract the cost of `response` and add it to `costs/<context>/<source>`."""
+        normalized_source = source.strip("/")
+        if not normalized_source:
+            raise ValueError("source must be non-empty")
+
+        cost = extract_api_cost(
+            response,
+            provider=provider,
+            model_name=model_name,
+            prompt_price_per_million=prompt_price_per_million,
+            completion_price_per_million=completion_price_per_million,
+            cached_prompt_price_per_million=cached_prompt_price_per_million,
+            cache_creation_price_per_million=cache_creation_price_per_million,
+            cache_read_price_per_million=cache_read_price_per_million,
+            cost_extractors=self._shared_state.cost_extractors,
+            model_pricing=self._shared_state.model_pricing,
+        )
+        if cost is None:
+            return None  # Nothing is recorded when no cost can be extracted.
+        self.add_cost(f"{self.cost_context}/{normalized_source}", cost)
+        return cost
+
+    def add_metric(self, key: str, value: float) -> None:
+        if "/" not in key:
+            raise ValueError("Metric key must include a section prefix")
+        self._validate_and_add(key, float(value))  # Same-key values sum within a step.
+
+    def add_data(
+        self,
+        step_num_scenarios: int | None = None,
+        step_actor_tokens: int | None = None,
+        scenario_ids: list[str] | None = None,
+    ) -> None:
+        """Add any provided scenario/token counts and queue scenario ids for `flush`."""
+        if step_num_scenarios is not None:
+            self.add_metric("data/step_num_scenarios", float(step_num_scenarios))
+        if step_actor_tokens is not None:
+            self.add_metric("data/step_actor_tokens", float(step_actor_tokens))
+        if scenario_ids is not None:
+            # Queued ids only join the unique-scenario set on the next `flush`.
+            self._shared_state.pending_scenario_ids.update(map(str, scenario_ids))
+
+    def add_user_timing(
+        self,
+        step_wall_s: float | None = None,
+        step_actor_s: float | None = None,
+        step_eval_s: float | None = None,
+    ) -> None:  # Each provided duration is added to its `time/step_*_s` metric.
+        if step_wall_s is not None:
+            self.add_metric("time/step_wall_s", float(step_wall_s))
+        if step_actor_s is not None:
+            self.add_metric("time/step_actor_s", float(step_actor_s))
+        if step_eval_s is not None:
+            self.add_metric("time/step_eval_s", float(step_eval_s))
+
+    def add_idle_times(
+        self,
+        step_trainer_idle_s: float | None = None,
+        step_actor_idle_s: float | None = None,
+    ) -> None:
+        """Add any provided trainer/actor idle times as `throughput/step_*` metrics."""
+        if step_trainer_idle_s is not None:
+            key = "throughput/step_trainer_idle_s"
+            self.add_metric(key, float(step_trainer_idle_s))
+        if step_actor_idle_s is not None:
+            key = "throughput/step_actor_idle_s"
+            self.add_metric(key, float(step_actor_idle_s))
+
+    @contextmanager
+    def measure(self, key: str):
+        started = time.monotonic()
+        try:
+            yield
+        finally:  # The elapsed time is recorded even when the body raises.
+            self.add_metric(key, time.monotonic() - started)
+
+    async def flush(self) -> dict[str, float]:
+        """Return this step's metrics with rollups and cumulative totals, then reset."""
+        async with self._shared_state.lock:
+            result = dict(self._shared_state.step_buffer)
+            cost_metrics = {
+                key: value
+                for key, value in self._shared_state.step_buffer.items()
+                if key.startswith("costs/")
+            }
+            result.update(self._compute_rollups(cost_metrics))
+            # Iterate over a snapshot because cumulative keys are inserted below.
+            for key, value in list(result.items()):
+                section = key.split("/", 1)[0]
+                if section not in _HIERARCHICAL_SECTIONS:
+                    continue
+                cum_key = to_cumulative_metric_key(key)
+                next_value = self._shared_state.cum_state.get(cum_key, 0.0) + value
+                self._shared_state.cum_state[cum_key] = next_value
+                result[cum_key] = next_value
+            # The unique-scenario total is only reported on steps that queued ids.
+            if self._shared_state.pending_scenario_ids:
+                self._shared_state.unique_scenario_ids.update(
+                    self._shared_state.pending_scenario_ids
+                )
+                result["data/cum/num_unique_scenarios"] = float(
+                    len(self._shared_state.unique_scenario_ids)
+                )
+            self._update_throughput_metrics(result)
+            self._shared_state.step_buffer.clear()
+            self._shared_state.pending_scenario_ids.clear()
+            return result
+
+    def activate(self) -> Token["MetricsBuilder"]:
+        return _active_builder.set(self)  # The token lets the caller undo this.
+
+    @contextmanager
+    def activate_context(self):
+        token = self.activate()
+        try:
+            yield self
+        finally:  # Restore whichever builder was active before.
+            token.var.reset(token)
+
+    @staticmethod
+    def get_active() -> "MetricsBuilder":
+        return _active_builder.get()  # Raises LookupError when none is active.
+
+    def for_cost_context(self, cost_context: str) -> "MetricsBuilder":
+        """Return a builder for `cost_context` that shares this builder's state."""
+        # Whitespace and surrounding slashes are ignored, matching `__init__`.
+        normalized_cost_context = cost_context.strip().strip("/")
+        if not normalized_cost_context:
+            raise ValueError("cost_context must be non-empty")
+        if normalized_cost_context == self.cost_context:
+            return self
+        # The sibling shares this builder's state, so all contexts flush together.
+        return MetricsBuilder(normalized_cost_context, _shared_state=self._shared_state)
+
+    def register_cost_extractor(self, provider: str, extractor: CostExtractor) -> None:
+        """Register `extractor` for `provider` in the state shared across builders."""
+        if (normalized_provider := normalize_provider(provider)) is None:
+            raise ValueError("provider must be non-empty")
+        self._shared_state.cost_extractors[normalized_provider] = extractor
+
+    def register_model_pricing(
+        self,
+        model_name: str,
+        *,
+        prompt_per_million: float,
+        completion_per_million: float,
+        cached_prompt_per_million: float | None = None,
+        cache_creation_per_million: float | None = None,
+        cache_read_per_million: float | None = None,
+    ) -> None:
+        """Store per-million-token prices for `model_name` in the shared pricing table.
+
+        Prices are coerced to float; omitted cache prices are stored as None.
+
+        Raises:
+            ValueError: If `model_name` normalizes to an empty value.
+        """
+        normalized_model_name = normalize_model_name(model_name)
+        if not normalized_model_name:
+            raise ValueError("model_name must be non-empty")
+
+        # Keeps None distinct from 0.0 for the optional cache prices.
+        def _optional_float(price: float | None) -> float | None:
+            return None if price is None else float(price)
+
+        self._shared_state.model_pricing[normalized_model_name] = TokenPricing(
+            prompt_per_million=float(prompt_per_million),
+            completion_per_million=float(completion_per_million),
+            cached_prompt_per_million=_optional_float(cached_prompt_per_million),
+            cache_creation_per_million=_optional_float(cache_creation_per_million),
+            cache_read_per_million=_optional_float(cache_read_per_million),
+        )
+
+    def state_dict(self) -> dict[str, Any]:  # Copies; ids sorted for stable output.
+        return {
+            "cum_state": dict(self._shared_state.cum_state),
+            "unique_scenario_ids": sorted(self._shared_state.unique_scenario_ids),
+        }
+
+    def load_state_dict(self, state: dict[str, Any]) -> None:
+        """Replace cumulative totals and known scenario ids with the ones in `state`."""
+        raw_cum_state = state.get("cum_state", {})
+        raw_unique_ids = state.get("unique_scenario_ids", [])
+        restored_cum_state = {str(k): float(v) for k, v in raw_cum_state.items()}
+        restored_unique_ids = {str(v) for v in raw_unique_ids}
+        self._shared_state.cum_state.clear()
+        self._shared_state.cum_state.update(restored_cum_state)
+        self._shared_state.unique_scenario_ids.clear()
+        self._shared_state.unique_scenario_ids.update(restored_unique_ids)
+        self._shared_state.pending_scenario_ids.clear()
+
+    def _validate_and_add(self, key: str, value: float) -> None:
+        """Add `value` to `key` in the step buffer after checking key-path rules."""
+        if is_cumulative_metric_key(key):
+            raise ValueError(
+                f"Metric key '{key}' uses the reserved cumulative namespace."
+            )
+        # A buffered key may not be both a leaf and an ancestor of another key.
+        for existing_key in self._shared_state.step_buffer:
+            if existing_key == key:
+                continue
+            if existing_key.startswith(f"{key}/"):
+                raise ValueError(
+                    f"Cannot log '{key}' as a leaf: it is an ancestor of '{existing_key}'."
+                )
+            if key.startswith(f"{existing_key}/"):
+                raise ValueError(
+                    f"Cannot log '{key}' as a leaf: '{existing_key}' is already a leaf ancestor."
+                )
+        self._shared_state.step_buffer[key] = (
+            self._shared_state.step_buffer.get(key, 0.0) + value
+        )
+
+    def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]:
+        """Sum leaf costs into every ancestor path and into the `costs/all` total.
+
+        Args:
+            cost_metrics: Leaf metrics whose keys start with `costs/`.
+
+        Returns:
+            One total per intermediate path (`costs/train` for `costs/train/llm`)
+            plus the grand total under `costs/all`. The result is an empty dict
+            when there are no cost metrics.
+        """
+        if not cost_metrics:
+            return {}
+
+        rollups: dict[str, float] = {}
+        # Single pass: every leaf adds its value to each of its ancestors.
+        for key, value in cost_metrics.items():
+            parts = key.split("/")
+            # Depth 1 is the bare `costs` section, which `costs/all` covers instead.
+            for depth in range(2, len(parts)):
+                parent = "/".join(parts[:depth])
+                rollups[parent] = rollups.get(parent, 0.0) + value
+
+        # Each leaf belongs to exactly one top-level child, so summing the leaves
+        # equals summing the top-level totals, without depending on set order.
+        rollups["costs/all"] = sum(cost_metrics.values(), 0.0)
+
+        return rollups
+
+    def _update_throughput_metrics(self, result: dict[str, float]) -> None:
+        """Add cumulative idle times and average token rates to `result` in place.
+
+        Idle times accumulate here because `flush` only accumulates the sections in
+        `_HIERARCHICAL_SECTIONS`, which do not include `throughput`.
+
+        Average rates divide cumulative tokens by cumulative seconds. Each rate is
+        emitted only on steps that reported one of its inputs, and only once both
+        cumulative values exist and the cumulative duration is positive.
+        """
+        cum_state = self._shared_state.cum_state
+        for step_key, cum_key in _THROUGHPUT_IDLE_MAPPINGS.items():
+            if step_key not in result:
+                continue
+            next_value = cum_state.get(cum_key, 0.0) + result[step_key]
+            cum_state[cum_key] = next_value
+            result[cum_key] = next_value
+
+        # Trainer rate, from the totals `flush` has already accumulated.
+        if "data/step_trainer_tokens" in result or "time/step_trainer_s" in result:
+            tokens = cum_state.get("data/cum/trainer_tokens")
+            seconds = cum_state.get("time/cum/trainer_s")
+            if tokens is not None and seconds is not None and seconds > 0:
+                result["throughput/avg_trainer_tok_per_s"] = tokens / seconds
+
+        # Same computation for the actor side.
+        if "data/step_actor_tokens" in result or "time/step_actor_s" in result:
+            tokens = cum_state.get("data/cum/actor_tokens")
+            seconds = cum_state.get("time/cum/actor_s")
+            if tokens is not None and seconds is not None and seconds > 0:
+                result["throughput/avg_actor_tok_per_s"] = tokens / seconds
+
+
+from .api_costs import track_api_cost
diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py
new file mode 100644
index 000000000..6965b68db
--- /dev/null
+++ b/src/art/metrics_taxonomy.py
@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Iterable
+
+from .trajectories import TrajectoryGroup
+
+TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps"
+_INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY})  # never averaged
+
+
+def average_metric_samples(
+    metric_samples: Iterable[dict[str, float]],
+) -> dict[str, float]:
+    """Average each metric over the samples that report it.
+
+    Keys in `_INVARIANT_METRIC_KEYS` are passed through unaveraged and must hold
+    the same value in every sample that reports them, else ValueError is raised.
+    """
+    sums: dict[str, float] = {}
+    seen: dict[str, int] = {}
+    pinned: dict[str, float] = {}
+    for sample in metric_samples:
+        for name, raw_value in sample.items():
+            current = float(raw_value)
+            if name in _INVARIANT_METRIC_KEYS:
+                if name not in pinned:
+                    pinned[name] = current
+                elif pinned[name] != current:
+                    raise ValueError(
+                        f"Metric '{name}' must be invariant across samples, "
+                        f"got {pinned[name]} and {current}."
+                    )
+            sums[name] = sums.get(name, 0.0) + current
+            seen[name] = seen.get(name, 0) + 1
+
+    # Output keys keep the order in which they were first seen.
+    averaged: dict[str, float] = {}
+    for name, total in sums.items():
+        is_invariant = name in _INVARIANT_METRIC_KEYS
+        averaged[name] = pinned[name] if is_invariant else total / seen[name]
+    return averaged
+
+
+@dataclass(frozen=True)
+class TrajectoryBatchSummary:
+    num_scenarios: int  # set to the number of groups by `summarize_trajectory_groups`
+    num_trajectories: int  # trajectories plus exceptions, summed over all groups
+    num_groups_submitted: int  # number of groups in the batch
+    num_groups_trainable: int  # groups whose trajectories have >1 distinct reward
+    scenario_ids: list[str]  # distinct scenario ids in first-seen order
+
+
+def summarize_trajectory_groups(
+    trajectory_groups: Iterable[TrajectoryGroup],
+) -> TrajectoryBatchSummary:
+    """Summarize a batch of trajectory groups as counts plus distinct scenario ids.
+
+    Each group counts as one scenario and one submitted group. Entries in
+    `group.exceptions` count towards the trajectory total. Scenario ids are
+    deduplicated in first-seen order; groups without an id contribute none.
+    """
+    batch = list(trajectory_groups)
+    found_ids = [_extract_scenario_id(group) for group in batch]
+    # `dict.fromkeys` drops repeats while preserving first-seen order.
+    distinct_ids = list(dict.fromkeys(i for i in found_ids if i is not None))
+
+    return TrajectoryBatchSummary(
+        num_scenarios=len(batch),
+        num_trajectories=sum(
+            len(group.trajectories) + len(group.exceptions) for group in batch
+        ),
+        num_groups_submitted=len(batch),
+        num_groups_trainable=sum(map(_group_is_trainable, batch)),
+        scenario_ids=distinct_ids,
+    )
+
+
+def build_data_metrics_from_summary(
+    summary: TrajectoryBatchSummary,
+    *,
+    include_trainable_groups: bool,
+) -> dict[str, float]:  # Float-valued `data/step_*` counts read from `summary`.
+    counts = {
+        "data/step_num_scenarios": summary.num_scenarios,
+        "data/step_num_trajectories": summary.num_trajectories,
+        "data/step_num_groups_submitted": summary.num_groups_submitted,
+    }
+    if include_trainable_groups:
+        counts["data/step_num_groups_trainable"] = summary.num_groups_trainable
+    return {key: float(count) for key, count in counts.items()}
+
+
+def build_train_metrics_from_summary(
+    summary: TrajectoryBatchSummary,
+) -> dict[str, float]:  # The same counts as floats under `train/` keys.
+    metrics: dict[str, float] = {}
+    metrics["train/num_groups_submitted"] = float(summary.num_groups_submitted)
+    metrics["train/num_groups_trainable"] = float(summary.num_groups_trainable)
+    metrics["train/num_trajectories"] = float(summary.num_trajectories)
+    return metrics
+
+
+def build_training_summary_metrics(
+    summary: TrajectoryBatchSummary,
+    *,
+    include_trainable_groups: bool,
+) -> dict[str, float]:
+    """Merge the `data/step_*` and `train/*` metrics derived from `summary`."""
+    combined = build_data_metrics_from_summary(
+        summary,
+        include_trainable_groups=include_trainable_groups,
+    )
+    combined.update(build_train_metrics_from_summary(summary))
+    return combined
+
+
+def _group_is_trainable(group: TrajectoryGroup) -> bool:
+    """Return True if the group's trajectories have more than one distinct reward."""
+    return len({trajectory.reward for trajectory in group.trajectories}) > 1
+
+
+def _extract_scenario_id(group: TrajectoryGroup) -> str | None:
+    """Return the first scenario id in the group's metadata, then its trajectories'."""
+    sources = [group.metadata]
+    sources.extend(trajectory.metadata for trajectory in group.trajectories)
+    for source in sources:
+        found = _extract_scenario_id_from_metadata(source)
+        if found is not None:
+            return found
+    return None
+
+
+def _extract_scenario_id_from_metadata(
+    metadata: dict[str, Any],
+) -> str | None:
+    """Return `metadata["scenario_id"]` as a string, or None when absent or None."""
+    raw_id = metadata.get("scenario_id")
+    # A stored None is treated the same as a missing key.
+    return None if raw_id is None else str(raw_id)
diff --git a/src/art/model.py b/src/art/model.py
index a5b135824..a5ea06c07 100644
--- a/src/art/model.py
+++ b/src/art/model.py
@@ -1,7 +1,9 @@
 import asyncio
+from contextvars import Token
 from datetime import datetime
 import json
 import os
+import time
 from typing import TYPE_CHECKING, Any, Generic, Iterable, Optional, cast, overload
 import warnings
 
@@ -13,6 +15,14 @@
 
 from . import dev
 from .costs import CostCalculator
+from .metrics import MetricsBuilder, is_builder_managed_metric
+from .metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    average_metric_samples,
+    build_data_metrics_from_summary,
+    build_train_metrics_from_summary,
+    summarize_trajectory_groups,
+)
 from .trajectories import Trajectory, TrajectoryGroup
 from .types import TrainConfig, TrainSFTConfig
 from .utils.trajectory_logging import write_trajectory_groups_parquet
@@ -26,9 +36,20 @@
 ModelConfig = TypeVar("ModelConfig", bound=BaseModel | None)
 StateType = TypeVar("StateType", bound=dict[str, Any], default=dict[str, Any])
 
-COSTS_STATE_KEY = "_costs"
-COSTS_METRIC_PREFIX = "costs_"
-COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total"
+METRICS_BUILDER_STATE_KEY = "_metrics_builder_state"  # model-state key for the builder snapshot
+METRIC_SECTIONS = frozenset(  # leading key components that _log_metrics leaves unprefixed
+    {
+        "reward",
+        "loss",
+        "offpolicy",
+        "pipeline",
+        "throughput",
+        "costs",
+        "time",
+        "data",
+    }
+)
+METRIC_SPLITS = frozenset({"train", "val", "test"})  # splits that get section-aware key routing
 
 
 class Model(
@@ -93,7 +114,13 @@ class Model(
     _s3_prefix: str | None = None
     _openai_client: AsyncOpenAI | None = None
     _wandb_run: Optional["Run"] = None  # Private, for lazy wandb initialization
-    _costs_lock: asyncio.Lock
+    _wandb_defined_metrics: set[str]
+    _run_start_time: float
+    _run_start_monotonic: float
+    _last_local_train_log_monotonic: float
+    _last_local_train_step: int | None
+    _metrics_builder: MetricsBuilder
+    _metrics_builder_state_loaded: bool
     _cost_calculator: CostCalculator
 
     def __init__(
@@ -123,6 +150,17 @@ def __init__(
             report_metrics=report_metrics,
             **kwargs,
         )
+        object.__setattr__(self, "_wandb_defined_metrics", set())
+        object.__setattr__(self, "_run_start_time", time.time())
+        object.__setattr__(self, "_run_start_monotonic", time.monotonic())
+        object.__setattr__(
+            self, "_last_local_train_log_monotonic", self._run_start_monotonic
+        )
+        object.__setattr__(self, "_last_local_train_step", None)
+        object.__setattr__(
+            self, "_metrics_builder", MetricsBuilder(cost_context="train")
+        )
+        object.__setattr__(self, "_metrics_builder_state_loaded", False)
 
     @overload
     def __new__(
@@ -376,13 +414,28 @@ def _get_wandb_run(self) -> Optional["Run"]:
                 ),
             )
             self._wandb_run = run
+            object.__setattr__(
+                self,
+                "_wandb_defined_metrics",
+                {
+                    "training_step",
+                    "time/wall_clock_sec",
+                },
+            )
 
             # Define training_step as the x-axis for all metrics.
             # This allows out-of-order logging (e.g., async validation for previous steps).
             wandb.define_metric("training_step")
+            wandb.define_metric("time/wall_clock_sec")
+            wandb.define_metric("reward/*", step_metric="training_step")
+            wandb.define_metric("loss/*", step_metric="training_step")
+            wandb.define_metric("throughput/*", step_metric="training_step")
+            wandb.define_metric("costs/*", step_metric="training_step")
+            wandb.define_metric("time/*", step_metric="training_step")
+            wandb.define_metric("data/*", step_metric="training_step")
             wandb.define_metric("train/*", step_metric="training_step")
             wandb.define_metric("val/*", step_metric="training_step")
-            wandb.define_metric("costs/*", step_metric="training_step")
+            wandb.define_metric("test/*", step_metric="training_step")
         return self._wandb_run
 
     def _log_metrics(
@@ -392,7 +445,24 @@ def _log_metrics(
         step: int,
     ) -> None:
         """Log metrics to history.jsonl and optionally wandb."""
-        prefixed = {f"{split}/{k}": v for k, v in metrics.items()}
+        if split in METRIC_SPLITS:
+            prefixed = {}
+            for key, value in metrics.items():
+                first_component = key.split("/", 1)[0]
+                has_prefix_component = "/" in key
+                if has_prefix_component and (
+                    first_component in METRIC_SECTIONS
+                    or first_component in METRIC_SPLITS
+                ):
+                    prefixed[key] = value
+                else:
+                    prefixed[f"{split}/{key}"] = value
+        else:
+            prefixed = {f"{split}/{k}": v for k, v in metrics.items()}
+
+        prefixed["training_step"] = step
+        prefixed["time/wall_clock_sec"] = time.time() - self._run_start_time
+
         output_dir = self._get_output_dir()
 
         # Ensure output directory exists
@@ -416,65 +486,158 @@ def _log_metrics(
         ) or (self.report_metrics is not None and "wandb" in self.report_metrics)
         if should_log_wandb:
             if run := self._get_wandb_run():
-                run.log({"training_step": step, **prefixed})
+                self._define_wandb_step_metrics(prefixed.keys())
+                # Let W&B use its own monotonically increasing history step.
+                # ART's `training_step` remains the x-axis via define_metric,
+                # which preserves out-of-order eval logging.
+                run.log(prefixed)
 
-    async def _record_costs(
+    def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
+        """Register unseen ``costs/`` keys against the ``training_step`` axis."""
+        import wandb
+
+        already_defined = self._wandb_defined_metrics
+        for key in keys:
+            if not key.startswith("costs/") or key in already_defined:
+                continue
+            wandb.define_metric(key, step_metric="training_step")
+            already_defined.add(key)
+
+    def _route_metrics_and_collect_non_costs(
+        self, metrics: dict[str, float], split: str
+    ) -> dict[str, float]:
+        """Send cost and builder-managed metrics to the builder; return the rest."""
+        leftover: dict[str, float] = {}
+        for name, raw_value in metrics.items():
+            amount = float(raw_value)
+            if name.startswith("costs/"):
+                self._metrics_builder.add_cost(name[len("costs/") :], amount)
+            elif name.startswith("costs_"):
+                raise ValueError(
+                    "Legacy cost keys like 'costs_prefill' are no longer supported. "
+                    "Log hierarchical costs like 'costs/train/prefill' or "
+                    "'costs/eval/prefill' instead."
+                )
+            elif is_builder_managed_metric(name):
+                self._metrics_builder.add_metric(name, amount)
+            else:
+                leftover[name] = amount
+        return leftover
+
+    def _collect_automatic_backend_metrics(
         self,
+        *,
         split: str,
         step: int,
+        provided_metric_keys: set[str],
+    ) -> dict[str, float]:  # automatic time/cost metrics; {} when not applicable
+        if split != "train" or self._backend is None:  # only the train split with a backend set
+            return {}
+
+        supports_step_metrics = getattr(
+            self._backend, "supports_automatic_train_step_metrics", None
+        )
+        if not callable(supports_step_metrics) or not supports_step_metrics():  # backend must opt in
+            return {}
+
+        if self._last_local_train_step == step:  # this step was already measured
+            return {}
+
+        now = time.monotonic()
+        step_wall_s = max(0.0, now - self._last_local_train_log_monotonic)  # since the last recorded timestamp
+        object.__setattr__(self, "_last_local_train_log_monotonic", now)
+        object.__setattr__(self, "_last_local_train_step", step)
+
+        automatic_metrics: dict[str, float] = {}
+        if "time/step_wall_s" not in provided_metric_keys:  # a caller-supplied value takes precedence
+            automatic_metrics["time/step_wall_s"] = step_wall_s
+
+        gpu_cost_getter = getattr(
+            self._backend, "automatic_gpu_cost_per_hour_usd", None
+        )
+        if callable(gpu_cost_getter) and "costs/gpu" not in provided_metric_keys:
+            gpu_cost_per_hour_usd = gpu_cost_getter(self)
+            if gpu_cost_per_hour_usd is not None:
+                automatic_metrics["costs/gpu"] = (
+                    step_wall_s * float(gpu_cost_per_hour_usd) / 3600.0  # hourly rate scaled to this step's seconds
+                )
+
+        return automatic_metrics
+
+    def _add_default_step_metrics(
+        self,
+        trajectory_groups: list[TrajectoryGroup],
         *,
-        cost_components: dict[str, float],
-        cost_total_direct: float,
-        cost_seen: bool,
-    ) -> None:
-        component_total = sum(cost_components.values())
-        step_total = component_total if component_total > 0 else cost_total_direct
-        if not cost_seen or step_total <= 0:
+        split: str,
+        provided_metric_keys: set[str],
+    ) -> dict[str, float]:  # train-only defaults for the caller to merge; {} otherwise
+        if split not in METRIC_SPLITS:
+            return {}
+
+        summary = summarize_trajectory_groups(trajectory_groups)
+        default_data_metrics = build_data_metrics_from_summary(
+            summary,
+            include_trainable_groups=split == "train",
+        )
+        for key, value in default_data_metrics.items():
+            if key in provided_metric_keys:  # explicit caller metrics override the defaults
+                continue
+            self._metrics_builder.add_metric(key, value)  # data defaults go to the builder, not the return value
+
+        if summary.scenario_ids:
+            self._metrics_builder.add_data(scenario_ids=summary.scenario_ids)
+
+        if split != "train":  # train-metric defaults are produced for the train split only
+            return {}
+
+        default_train_metrics = build_train_metrics_from_summary(summary)
+        return {
+            key: value
+            for key, value in default_train_metrics.items()
+            if key not in provided_metric_keys
+        }
+
+    def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder:
+        """Return the model's builder, scoped to ``cost_context`` when given."""
+        self._load_metrics_builder_state()
+        base = self._metrics_builder
+        return base if cost_context is None else base.for_cost_context(cost_context)
+
+    def activate_metrics_context(self, cost_context: str) -> Token[MetricsBuilder]:
+        return self.metrics_builder(cost_context).activate()  # token lets the caller reset the context
+
+    def _load_metrics_builder_state(self) -> None:
+        if self._metrics_builder_state_loaded:  # load at most once per instance
             return
+        state = self.read_state() or {}
+        metrics_state = state.get(METRICS_BUILDER_STATE_KEY)
+        if isinstance(metrics_state, dict):  # ignore a missing or non-dict entry
+            self._metrics_builder.load_state_dict(metrics_state)
+        object.__setattr__(self, "_metrics_builder_state_loaded", True)
+
+    def _persist_metrics_builder_state(self) -> None:
+        """Write the builder's ``state_dict()`` into the model state."""
+        snapshot = self._metrics_builder.state_dict()
+        self.merge_state({METRICS_BUILDER_STATE_KEY: snapshot})
 
-        async with self._costs_lock:
-            existing_state = self.read_state() or {}
-            raw_costs = existing_state.get(COSTS_STATE_KEY) or {}
-            cumulative = {
-                key: float(value)
-                for key, value in raw_costs.items()
-                if isinstance(value, (int, float))
-            }
-            last_steps = raw_costs.get("_last_steps")
-            if not isinstance(last_steps, dict):
-                last_steps = {}
-            last_step = last_steps.get(split)
-
-            if isinstance(last_step, (int, float)) and int(last_step) >= step:
-                for component, value in cost_components.items():
-                    if value == 0:
-                        continue
-                    cumulative_key = f"{split}_{component}"
-                    cumulative[cumulative_key] = max(
-                        cumulative.get(cumulative_key, 0.0), value
-                    )
-                cumulative[split] = max(cumulative.get(split, 0.0), step_total)
-                cumulative["total"] = max(
-                    cumulative.get("total", 0.0), cumulative.get(split, 0.0)
-                )
-                self.merge_state(
-                    {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}}
-                )
-                self._log_metrics(cumulative, "costs", step)
-                return
-
-            for component, value in cost_components.items():
-                if value == 0:
-                    continue
-                cumulative_key = f"{split}_{component}"
-                cumulative[cumulative_key] = cumulative.get(cumulative_key, 0.0) + value
-            cumulative[split] = cumulative.get(split, 0.0) + step_total
-            cumulative["total"] = cumulative.get("total", 0.0) + step_total
-            last_steps[split] = step
-            self.merge_state(
-                {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}}
-            )
-            self._log_metrics(cumulative, "costs", step)
+    def _normalize_trajectory_groups(
+        self,
+        trajectories: Iterable[Trajectory | BaseException] | Iterable[TrajectoryGroup],
+    ) -> list[TrajectoryGroup]:
+        """Coerce ``trajectories`` into a list of groups; mixed input raises."""
+        items = list(trajectories)
+        if all(isinstance(item, TrajectoryGroup) for item in items):
+            # Already grouped (or empty): hand the materialized list back.
+            return cast(list[TrajectoryGroup], items)
+
+        if not all(isinstance(item, (Trajectory, BaseException)) for item in items):
+            raise TypeError(
+                "trajectories must be an iterable of TrajectoryGroup objects or "
+                "an iterable of Trajectory/BaseException items"
+            )
+
+        # Loose trajectories/exceptions are wrapped into a single group.
+        return [TrajectoryGroup(cast(Iterable[Trajectory | BaseException], items))]
 
     async def log(
         self,
@@ -506,58 +669,45 @@ async def log(
         if step is None:
             step = await self.get_step() if self.trainable else 0
 
+        self._load_metrics_builder_state()
+
         # If only metrics provided (no trajectories), just log them and return
         if trajectories is None:
             if metrics is not None:
-                cost_step = await self.get_step()
-                cost_components: dict[str, float] = {}
-                cost_total_direct = 0.0
-                cost_seen = False
-
-                for metric, value in metrics.items():
-                    if not isinstance(value, (int, float)):
-                        continue
-                    if metric == COSTS_TOTAL_KEY:
-                        raise ValueError(
-                            "Do not log 'costs_total' directly. Log costs_* components "
-                            "(e.g., costs_prefill, costs_sample) and totals are derived."
-                        )
-                    elif metric.startswith(COSTS_METRIC_PREFIX):
-                        component = metric[len(COSTS_METRIC_PREFIX) :]
-                        if component:
-                            cost_components[component] = cost_components.get(
-                                component, 0.0
-                            ) + float(value)
-                            cost_seen = True
-
-                metrics_without_costs = {
-                    key: value
-                    for key, value in metrics.items()
-                    if not key.startswith(COSTS_METRIC_PREFIX)
-                }
-                if metrics_without_costs:
-                    self._log_metrics(metrics_without_costs, split, step)
-
-                await self._record_costs(
-                    split,
-                    cost_step,
-                    cost_components=cost_components,
-                    cost_total_direct=cost_total_direct,
-                    cost_seen=cost_seen,
+                provided_metric_keys = set(metrics)
+                automatic_metrics = self._collect_automatic_backend_metrics(
+                    split=split,
+                    step=step,
+                    provided_metric_keys=provided_metric_keys,
+                )
+                if automatic_metrics:
+                    self._route_metrics_and_collect_non_costs(automatic_metrics, split)
+                metrics_without_costs = self._route_metrics_and_collect_non_costs(
+                    metrics, split
                 )
+                builder_metrics = await self._metrics_builder.flush()
+                merged_metrics = {**metrics_without_costs, **builder_metrics}
+                if merged_metrics:
+                    self._log_metrics(merged_metrics, split, step)
+                self._persist_metrics_builder_state()
             return
 
-        # Convert to list[TrajectoryGroup]
-        if any(isinstance(t, Trajectory) for t in trajectories) or any(
-            isinstance(t, BaseException) for t in trajectories
-        ):
-            trajectory_groups = [
-                TrajectoryGroup(
-                    cast(Iterable[Trajectory | BaseException], trajectories)
-                )
-            ]
-        else:
-            trajectory_groups = cast(list[TrajectoryGroup], list(trajectories))
+        trajectory_groups = self._normalize_trajectory_groups(trajectories)
+        provided_metric_keys = set(metrics or {})
+
+        automatic_metrics = self._collect_automatic_backend_metrics(
+            split=split,
+            step=step,
+            provided_metric_keys=provided_metric_keys,
+        )
+        if automatic_metrics:
+            self._route_metrics_and_collect_non_costs(automatic_metrics, split)
+
+        default_train_metrics = self._add_default_step_metrics(
+            trajectory_groups,
+            split=split,
+            provided_metric_keys=provided_metric_keys,
+        )
 
         # Ensure output directories exist
         output_dir = self._get_output_dir()
@@ -571,59 +721,53 @@ async def log(
         )
 
         # 2. Calculate aggregate metrics (excluding additive costs)
-        cost_step = await self.get_step()
-        all_metrics: dict[str, list[float]] = {"reward": [], "exception_rate": []}
+        reward_key = "reward/mean" if split == "train" else "reward"
+        exception_rate_key = (
+            "reward/exception_rate" if split == "train" else "exception_rate"
+        )
+        reward_std_dev_key = "reward/std_dev" if split == "train" else "reward_std_dev"
+
+        all_metrics: dict[str, list[float]] = {
+            reward_key: [],
+            exception_rate_key: [],
+        }
         group_metrics: dict[str, list[float]] = {}
-        cost_components: dict[str, float] = {}
-        cost_total_direct = 0.0
-        cost_seen = False
-
-        def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None:
-            nonlocal cost_total_direct, cost_seen
-            for metric, value in metrics_dict.items():
-                if not isinstance(value, (int, float)):
-                    continue
-                if metric == COSTS_TOTAL_KEY:
-                    raise ValueError(
-                        "Do not log 'costs_total' directly. Log costs_* components "
-                        "(e.g., costs_prefill, costs_sample) and totals are derived."
-                    )
-                elif metric.startswith(COSTS_METRIC_PREFIX):
-                    component = metric[len(COSTS_METRIC_PREFIX) :]
-                    if component:
-                        cost_components[component] = cost_components.get(
-                            component, 0.0
-                        ) + float(value)
-                        cost_seen = True
 
         for group in trajectory_groups:
             if group.metrics:
-                _add_costs(group.metrics)
+                group_non_cost = self._route_metrics_and_collect_non_costs(
+                    cast(dict[str, float], group.metrics), split
+                )
+            else:
+                group_non_cost = {}
             if group.trajectories:
-                for metric, value in group.metrics.items():
-                    if metric.startswith(COSTS_METRIC_PREFIX):
-                        continue
+                for metric, value in group_non_cost.items():
                     if metric not in group_metrics:
                         group_metrics[metric] = []
                     group_metrics[metric].append(float(value))
-            for trajectory in group:
-                if isinstance(trajectory, BaseException):
-                    all_metrics["exception_rate"].append(1)
-                    continue
-                else:
-                    all_metrics["exception_rate"].append(0)
-                # Add reward metric
-                all_metrics["reward"].append(trajectory.reward)
+
+            all_metrics[exception_rate_key].extend(0.0 for _ in group.trajectories)
+            all_metrics[exception_rate_key].extend(1.0 for _ in group.exceptions)
+
+            for trajectory in group.trajectories:
+                all_metrics[reward_key].append(trajectory.reward)
 
                 # Collect other custom metrics
+                trajectory_metrics: dict[str, float] = {}
                 for metric, value in trajectory.metrics.items():
-                    if metric.startswith(COSTS_METRIC_PREFIX):
-                        continue
+                    routed_metric = metric
+                    if split == "train" and "/" not in routed_metric:
+                        routed_metric = f"reward/{routed_metric}"
+                    trajectory_metrics[routed_metric] = float(value)
+
+                non_cost_trajectory_metrics = self._route_metrics_and_collect_non_costs(
+                    trajectory_metrics,
+                    split,
+                )
+                for metric, value in non_cost_trajectory_metrics.items():
                     if metric not in all_metrics:
                         all_metrics[metric] = []
                     all_metrics[metric].append(float(value))
-                if trajectory.metrics:
-                    _add_costs(trajectory.metrics)
 
         # Calculate averages for all metrics
         averages: dict[str, float] = {}
@@ -631,39 +775,38 @@ def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None:
             if len(values) > 0:
                 averages[metric] = sum(values) / len(values)
 
+        averages.update(default_train_metrics)
+
         # Aggregate group-level metrics once per group
         for metric, values in group_metrics.items():
             if len(values) > 0:
-                averages[f"group_metric_{metric}"] = sum(values) / len(values)
+                group_key = (
+                    f"reward/group_{metric}"
+                    if split == "train"
+                    else f"group_metric_{metric}"
+                )
+                averages[group_key] = sum(values) / len(values)
 
         # Calculate average standard deviation of rewards within groups
         from .utils.old_benchmarking.calculate_step_metrics import (
             calculate_step_std_dev,
         )
 
-        averages["reward_std_dev"] = calculate_step_std_dev(trajectory_groups)
+        averages[reward_std_dev_key] = calculate_step_std_dev(trajectory_groups)
 
         # Merge in any additional metrics passed directly
         if metrics is not None:
-            _add_costs(metrics)
-            metrics_without_costs = {
-                key: value
-                for key, value in metrics.items()
-                if not key.startswith(COSTS_METRIC_PREFIX)
-            }
+            metrics_without_costs = self._route_metrics_and_collect_non_costs(
+                metrics, split
+            )
             averages.update(metrics_without_costs)
 
-        # 3. Log metrics (writes to history.jsonl and wandb)
-        self._log_metrics(averages, split, step)
-
-        # 4. Log cumulative costs (additive)
-        await self._record_costs(
-            split,
-            cost_step,
-            cost_components=cost_components,
-            cost_total_direct=cost_total_direct,
-            cost_seen=cost_seen,
-        )
+        # 3. Merge in any builder-managed metrics and log a single row.
+        builder_metrics = await self._metrics_builder.flush()
+        merged_metrics = {**averages, **builder_metrics}
+        if merged_metrics:
+            self._log_metrics(merged_metrics, split, step)
+        self._persist_metrics_builder_state()
 
     async def get_step(self) -> int:
         """
@@ -714,7 +857,6 @@ def __init__(
             report_metrics=report_metrics,
             **kwargs,
         )
-        object.__setattr__(self, "_costs_lock", asyncio.Lock())
         object.__setattr__(self, "_cost_calculator", self._noop_cost_calculator)
         if _internal_config is not None:
             # Bypass BaseModel __setattr__ to allow setting private attr
@@ -733,7 +875,9 @@ def set_cost_calculator(self, calculator: CostCalculator | None) -> None:
 
     @staticmethod
     def _noop_cost_calculator(
-        _prompt_tokens: int | None, _completion_tokens: int | None
+        _prompt_tokens: int | None,
+        _completion_tokens: int | None,
+        _cost_context: str,
     ) -> dict[str, float]:
         return {}
 
@@ -881,6 +1025,7 @@ async def train(
 
         # 1. Train (backend no longer logs internally)
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self.backend()._train_model(
             self,
             groups_list,
@@ -889,16 +1034,11 @@ async def train(
             verbose,
         ):
             training_metrics.append(metrics)
+        trainer_elapsed = time.monotonic() - trainer_started
 
         # 2. Calculate aggregated training metrics
-        avg_metrics: dict[str, float] = {}
-        if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-                if k != "num_gradient_steps"
-            }
+        avg_metrics = average_metric_samples(training_metrics)
+        avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed)
 
         # 3. Log trajectories and training metrics together (single wandb log call)
         step = await self.get_step()
@@ -929,6 +1069,7 @@ async def train_sft(
         # Collect all metrics and aggregate them at the end (same as RL)
         _config = _config or {}  # ty:ignore[invalid-assignment]
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self.backend()._train_sft(
             self,
             trajectories,
@@ -937,14 +1078,14 @@ async def train_sft(
             verbose,
         ):
             training_metrics.append(metrics)
+        trainer_elapsed = time.monotonic() - trainer_started
 
         # Log aggregated training metrics once (same as RL)
         if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-            }
+            avg_metrics = average_metric_samples(training_metrics)
+            avg_metrics["time/step_trainer_s"] = trainer_elapsed
             # Get the current step after training
             step = await self.get_step()
-            self._log_metrics(avg_metrics, "train", step)
+            await self.log(
+                trajectories=None, split="train", metrics=avg_metrics, step=step
+            )
diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
index 52c829750..f9593c240 100644
--- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
+++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
@@ -159,12 +159,12 @@ def print_history_summary(model: art.TrainableModel, tail: int = 5) -> None:
 
     rows = pl.read_ndjson(str(history_path)).to_dicts()
 
-    train_rows = [row for row in rows if "train/reward" in row]
+    train_rows = [row for row in rows if "reward/mean" in row]
     print("\nRecent training metrics:")
     for row in train_rows[-tail:]:
         step = row["step"]
-        reward = row["train/reward"]
-        std_dev = row["train/reward_std_dev"]
+        reward = row["reward/mean"]
+        std_dev = row["reward/std_dev"]
         discarded = row["train/discarded_stale_samples"]
         off_policy = row["train/steps_off_policy"]
         print(
@@ -229,7 +229,9 @@ async def main() -> None:
     openai_client = model.openai_client()
     cost_calculator = model.cost_calculator
 
-    async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory:
+    async def do_rollout(
+        scenario: Scenario, temp: float, cost_context: str
+    ) -> art.Trajectory:
         """Core rollout logic used by both training and eval."""
         messages: art.Messages = scenario["messages"]
         response = await openai_client.chat.completions.create(
@@ -265,6 +267,7 @@ async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory:
         sample_costs = cost_calculator(
             prompt_tokens,
             completion_tokens,
+            cost_context,
         )
         if sample_costs:
             metrics.update(sample_costs)
@@ -281,7 +284,7 @@ async def single_rollout(
         scenario: Scenario,
         _config: PipelineConfig,
     ) -> art.Trajectory:
-        return await do_rollout(scenario, temperature)
+        return await do_rollout(scenario, temperature, "train")
 
     rollout_fn = make_group_rollout_fn(single_rollout, n=rollouts_per_scenario)
 
@@ -290,7 +293,7 @@ async def single_rollout(
     async def eval_fn(
         _model: art.TrainableModel, _step: int, _config: PipelineConfig
     ) -> list[art.Trajectory]:
-        tasks = [do_rollout(build_scenario(), eval_temperature)]
+        tasks = [do_rollout(build_scenario(), eval_temperature, "eval")]
         results = await asyncio.gather(*tasks, return_exceptions=True)
         trajectories = [r for r in results if isinstance(r, art.Trajectory)]
         if trajectories:
@@ -312,7 +315,7 @@ def build_scenario() -> Scenario:
     async def scenario_iter():
         for i in range(scenario_count):
             scenario = build_scenario()
-            scenario["metadata"] = {"scenario_idx": i}
+            scenario["metadata"] = {"scenario_id": str(i)}
             yield scenario
 
     config = PipelineConfig(
diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py
index a061636b5..5d569277a 100644
--- a/src/art/pipeline_trainer/trainer.py
+++ b/src/art/pipeline_trainer/trainer.py
@@ -16,6 +16,8 @@
 from .types import ConfigT, EvalFn, RolloutFn, ScenarioT, SingleRolloutFn  # noqa: F401
 
 PIPELINE_STATE_KEY = "_pipeline_trainer"
+_ROLLOUT_WALL_TIME_KEY = "_art_rollout_wall_s"  # group.metadata key: rollout wall seconds
+_ACTOR_IDLE_TIME_KEY = "_art_actor_idle_s"  # group.metadata key: policy wait + queue wait seconds
 
 
 def _to_async_iterator(iterable: Iterable[T] | AsyncIterator[T]) -> AsyncIterator[T]:
@@ -322,13 +324,21 @@ async def _rollout_worker(self, worker_id: int) -> None:
             self._status.note_rollout_started()
             errored = False
             try:
+                wait_started = time.monotonic()
                 await self._wait_for_policy()
+                actor_idle_s = time.monotonic() - wait_started
                 if self.state.done:
                     break
 
                 initial_version = self.state.policy_version
 
-                group = await self.rollout_fn(self.model, scenario, self.config)
+                token = self.model.activate_metrics_context("train")
+                rollout_started = time.monotonic()
+                try:
+                    group = await self.rollout_fn(self.model, scenario, self.config)
+                finally:
+                    token.var.reset(token)
+                rollout_wall_s = time.monotonic() - rollout_started
                 if not isinstance(group, TrajectoryGroup):
                     errored = True
                     continue
@@ -340,7 +350,9 @@ async def _rollout_worker(self, worker_id: int) -> None:
                 )
                 if self.state.done:
                     break
-                await self._put_output_group(group)
+                queue_wait_s = await self._put_output_group(group)
+                group.metadata[_ROLLOUT_WALL_TIME_KEY] = rollout_wall_s
+                group.metadata[_ACTOR_IDLE_TIME_KEY] = actor_idle_s + queue_wait_s
             except asyncio.CancelledError:
                 raise
             except Exception as exc:
@@ -379,13 +391,17 @@ async def _training_stage(self) -> None:
             if stop_at_step is not None and current_step >= stop_at_step:
                 break
             step_start = time.monotonic()
+            collect_started = time.monotonic()
             batch, discarded, saw_sentinel = await self._collect_batch(current_step)
+            trainer_idle_s = time.monotonic() - collect_started
             self.state.discarded_stale_samples += discarded
             if discarded:
                 self._status.note_stale(discarded)
             if not batch:
                 break
 
+            actor_wall_s, actor_idle_s = self._consume_batch_rollout_timings(batch)
+
             expected_step = current_step + 1
             should_eval_step = self._should_eval_step(expected_step)
             should_checkpoint = self.save_checkpoint and should_eval_step
@@ -395,10 +411,9 @@ async def _training_stage(self) -> None:
                 self.state.policy_updated.notify_all()
 
             self._status.note_training_start(len(batch))
-            train_call_start: float | None = None
+            train_call_start = time.monotonic()
             if os.getenv("ART_TRAIN_STEP_LOG"):
                 print(f"[train] step {expected_step} starting (batch={len(batch)})")
-                train_call_start = time.perf_counter()
             try:
                 result = await self.backend.train(
                     self.model,
@@ -414,8 +429,8 @@ async def _training_stage(self) -> None:
                 self._status.note_training_end()
                 raise
             finally:
-                if train_call_start is not None:
-                    train_call_elapsed = time.perf_counter() - train_call_start
+                train_call_elapsed = time.monotonic() - train_call_start
+                if os.getenv("ART_TRAIN_STEP_LOG"):
                     print(
                         f"[train] step {expected_step} done in "
                         f"{train_call_elapsed:.1f}s"
@@ -438,7 +453,14 @@ async def _training_stage(self) -> None:
                     ),
                     "steps_off_policy": steps_off_policy,
                     "num_groups": float(len(batch)),
+                    "time/step_wall_s": step_seconds,
+                    "throughput/step_trainer_idle_s": trainer_idle_s,
                 }
+                metrics.setdefault("time/step_trainer_s", train_call_elapsed)
+                if actor_wall_s > 0:
+                    metrics["time/step_actor_s"] = actor_wall_s
+                if actor_idle_s > 0:
+                    metrics["throughput/step_actor_idle_s"] = actor_idle_s
                 metrics.update(result.metrics)
 
                 await self.model.log(
@@ -561,14 +583,22 @@ async def _run_eval(self, step: int) -> None:
         assert self.eval_fn is not None
         self._status.note_val_started(step)
         reward: float | None = None
+        eval_elapsed = 0.0
         try:
-            result = await self.eval_fn(self.model, step, self.config)
+            token = self.model.activate_metrics_context("eval")
+            eval_started = time.monotonic()
+            try:
+                result = await self.eval_fn(self.model, step, self.config)
+            finally:
+                token.var.reset(token)
+                eval_elapsed = time.monotonic() - eval_started
             splits: dict[str, list[art.Trajectory | art.TrajectoryGroup]]
             if isinstance(result, dict):
                 splits = result
             else:
                 splits = {"val": result}
 
+            logged_eval_timing = False
             for split_name, items in splits.items():
                 groups, trajectories = self._normalize_eval_items(items)
                 if split_name == "val":
@@ -577,7 +607,25 @@ async def _run_eval(self, step: int) -> None:
                     else:
                         reward = None
                 if groups:
-                    await self.model.log(groups, split=split_name, step=step)
+                    metrics = (
+                        {"time/step_eval_s": eval_elapsed}
+                        if not logged_eval_timing
+                        else None
+                    )
+                    await self.model.log(
+                        groups,
+                        split=split_name,
+                        step=step,
+                        metrics=metrics,
+                    )
+                    logged_eval_timing = True
+            if not logged_eval_timing and eval_elapsed > 0:
+                await self.model.log(
+                    trajectories=None,
+                    split="val",
+                    step=step,
+                    metrics={"time/step_eval_s": eval_elapsed},
+                )
         except asyncio.CancelledError:
             raise
         except Exception as exc:
@@ -630,6 +678,9 @@ def _apply_scenario_metadata(
                 continue
             if not self._is_scalar_metadata(value):
                 continue
+            if key == "scenario_id":
+                group.metadata["scenario_id"] = value
+                continue
             group.metadata[f"scenario_{key}"] = value
 
     def _is_group_stale(self, group: TrajectoryGroup, min_version: int) -> bool:
@@ -734,12 +785,31 @@ def _persist_state(self, training_step: int) -> None:
     def _is_scalar_metadata(value: object) -> bool:
         return value is None or isinstance(value, (str, int, float, bool))
 
-    async def _put_output_group(self, group: TrajectoryGroup) -> None:
+    async def _put_output_group(self, group: TrajectoryGroup) -> float:
         assert self._output_queue is not None
+        queue_wait_started = time.monotonic()  # start of the time spent in this call
         while not self.state.done:
             try:
                 await asyncio.wait_for(self._output_queue.put(group), timeout=1.0)
                 self._status.note_group_enqueued(group)
-                return
+                return time.monotonic() - queue_wait_started  # group was enqueued
             except asyncio.TimeoutError:
                 continue
+        return time.monotonic() - queue_wait_started  # state.done set; not enqueued
+
+    def _consume_batch_rollout_timings(
+        self, batch: list[TrajectoryGroup]
+    ) -> tuple[float, float]:
+        """Pop the private timing keys from every group; return (wall, idle) sums."""
+        wall_total = idle_total = 0.0
+        for traj_group in batch:
+            wall_total += self._pop_float_metadata(traj_group, _ROLLOUT_WALL_TIME_KEY)
+            idle_total += self._pop_float_metadata(traj_group, _ACTOR_IDLE_TIME_KEY)
+        return wall_total, idle_total
+
+    @staticmethod
+    def _pop_float_metadata(group: TrajectoryGroup, key: str) -> float:
+        """Remove ``key`` from the group's metadata and return it as a float."""
+        popped = group.metadata.pop(key, 0.0)
+        # Anything that is not an int/float (e.g. a string or None) counts as zero.
+        return float(popped) if isinstance(popped, (int, float)) else 0.0
diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py
index dea0198e7..fcb9f68fb 100644
--- a/src/art/serverless/backend.py
+++ b/src/art/serverless/backend.py
@@ -1,4 +1,5 @@
 import asyncio
+import time
 from typing import TYPE_CHECKING, Any, AsyncIterator, Iterable, Literal
 import warnings
 
@@ -9,6 +10,12 @@
 
 from .. import dev
 from ..backend import AnyTrainableModel, Backend
+from ..metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    average_metric_samples,
+    build_training_summary_metrics,
+    summarize_trajectory_groups,
+)
 from ..trajectories import Trajectory, TrajectoryGroup
 from ..types import ServerlessTrainResult, TrainConfig, TrainSFTConfig
 from ..utils.record_provenance import record_provenance
@@ -30,6 +37,44 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None:
     return None
 
 
+_UPSTREAM_TRAIN_METRIC_KEYS: dict[str, str] = {  # upstream name -> taxonomy key
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",  # same target as "policy_loss"
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def _canonicalize_upstream_metric_key(metric: str) -> str:
+    """Return the taxonomy key for an upstream metric ("" means drop it)."""
+    if "/" in metric:  # already namespaced; pass through untouched
+        return metric
+    if metric.startswith("group_metric_"):
+        return "reward/group_" + metric[len("group_metric_") :]
+    dropped = metric == "tokens_per_second"
+    return "" if dropped else _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
+
+
+def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]:
+    """Rename metric keys to their taxonomy names, skipping dropped ("") keys."""
+    renamed = (
+        (_canonicalize_upstream_metric_key(name), val) for name, val in metrics.items()
+    )
+    return {target: float(val) for target, val in renamed if target}
+
+
 class ServerlessBackend(Backend):
     def __init__(
         self, *, api_key: str | None = None, base_url: str | None = None
@@ -233,20 +278,28 @@ async def train(  # type: ignore[override]
 
         # Collect metrics from training
         training_metrics: list[dict[str, float]] = []
+        trainer_started = time.monotonic()
         async for metrics in self._train_model(
             model, groups_list, config, dev_config, verbose
         ):
             training_metrics.append(metrics)
 
         # Aggregate metrics
-        avg_metrics: dict[str, float] = {}
-        if training_metrics:
-            avg_metrics = {
-                k: sum(d.get(k, 0) for d in training_metrics)
-                / sum(1 for d in training_metrics if k in d)
-                for k in {k for d in training_metrics for k in d}
-                if k != "num_gradient_steps"
+        avg_metrics = average_metric_samples(training_metrics)
+        summary = summarize_trajectory_groups(groups_list)
+        avg_metrics.setdefault(
+            "time/step_trainer_s", time.monotonic() - trainer_started
+        )
+        avg_metrics.update(
+            {
+                key: value
+                for key, value in build_training_summary_metrics(
+                    summary,
+                    include_trainable_groups=True,
+                ).items()
+                if key not in avg_metrics
             }
+        )
 
         # Get step and artifact name
         step = await self._get_step(model)
@@ -273,6 +326,11 @@ async def _train_model(
         dev_config: dev.TrainConfig,
         verbose: bool = False,
     ) -> AsyncIterator[dict[str, float]]:
+        summary = summarize_trajectory_groups(trajectory_groups)
+        base_metrics = build_training_summary_metrics(
+            summary,
+            include_trainable_groups=True,
+        )
         assert model.id is not None, "Model ID is required"
         training_job = await self._client.training_jobs.create(  # ty:ignore[possibly-missing-attribute]
             model_id=model.id,
@@ -305,7 +363,14 @@ async def _train_model(
                     assert pbar is not None and num_sequences is not None
                     pbar.update(1)
                     pbar.set_postfix(event.data)
-                    yield {**event.data, "num_gradient_steps": num_sequences}
+                    metrics = _canonicalize_upstream_metrics(
+                        {k: float(v) for k, v in event.data.items()}
+                    )
+                    yield {
+                        **base_metrics,
+                        **metrics,
+                        TRAIN_GRADIENT_STEPS_KEY: float(num_sequences),
+                    }
                 elif event.type == "training_started":
                     num_sequences = event.data["num_sequences"]
                     if pbar is None:
@@ -470,7 +535,15 @@ async def _train_sft(
                     assert pbar is not None and num_batches is not None
                     pbar.update(1)
                     pbar.set_postfix(event.data)
-                    yield {**event.data, "num_gradient_steps": num_batches}
+                    metrics = _canonicalize_upstream_metrics(
+                        {k: float(v) for k, v in event.data.items()}
+                    )
+                    yield {
+                        **metrics,
+                        "data/step_num_trajectories": float(num_trajectories),
+                        "train/num_trajectories": float(num_trajectories),
+                        TRAIN_GRADIENT_STEPS_KEY: float(num_batches),
+                    }
                 elif event.type == "training_started":
                     num_batches = event.data.get("num_sequences", 0)
                     if pbar is None:
diff --git a/src/art/tinker/service.py b/src/art/tinker/service.py
index ba6768eb8..1f5970aca 100644
--- a/src/art/tinker/service.py
+++ b/src/art/tinker/service.py
@@ -80,7 +80,7 @@ def custom_loss_fn(
             for mask, lp in zip(masks, logprobs_list):
                 logprobs[mask] = lp
             loss = loss_fn(inputs, logprobs.unsqueeze(0), None, None, _config)
-            return loss.mean_policy_loss, {"policy_loss": loss.mean_policy_loss.item()}
+            return loss.mean_policy_loss, {"loss/train": loss.mean_policy_loss.item()}
 
         shifted_tokens = shift_tensor(packed_tensors["tokens"], 0)
 
diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py
index e5eb1180e..500a850fa 100644
--- a/src/art/tinker_native/backend.py
+++ b/src/art/tinker_native/backend.py
@@ -30,6 +30,10 @@
 from .. import dev
 from ..backend import Backend
 from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing
+from ..metrics_taxonomy import (
+    build_training_summary_metrics,
+    summarize_trajectory_groups,
+)
 from ..model import Model, TrainableModel
 from ..tinker.backend import get_renderer_name
 from ..tinker.server import get_free_port
@@ -47,6 +51,35 @@
 STATE_KEY_LATEST_STEP = "latest_step"
 T = TypeVar("T")
 
+_UPSTREAM_TRAIN_METRIC_KEYS: dict[str, str] = {  # upstream name -> taxonomy key
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",  # same target as "policy_loss"
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def _canonicalize_upstream_metric_key(metric: str) -> str:
+    """Return the taxonomy key for an upstream metric ("" means drop it)."""
+    if "/" in metric:  # already namespaced; pass through untouched
+        return metric
+    if metric.startswith("group_metric_"):
+        return "reward/group_" + metric[len("group_metric_") :]
+    dropped = metric == "tokens_per_second"
+    return "" if dropped else _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
+
 
 @dataclass
 class ModelState:
@@ -208,6 +241,7 @@ async def train(  # type: ignore[override]
     ) -> TrainResult:
         state = self._model_state[model.name]
         groups_list = list(trajectory_groups)
+        summary = summarize_trajectory_groups(groups_list)
 
         datums = trajectory_groups_to_datums(
             groups_list,
@@ -217,8 +251,11 @@ async def train(  # type: ignore[override]
         )
 
         metrics: dict[str, float] = {
-            "num_groups_submitted": float(len(groups_list)),
-            "num_datums": float(len(datums)),
+            **build_training_summary_metrics(
+                summary,
+                include_trainable_groups=True,
+            ),
+            "data/step_num_datums": float(len(datums)),
         }
 
         if not datums:
@@ -227,10 +264,13 @@ async def train(  # type: ignore[override]
         train_tokens = 0
         for datum in datums:
             train_tokens += len(datum.model_input.to_ints())
-        metrics["train_tokens"] = float(train_tokens)
+        metrics["data/step_trainer_tokens"] = float(train_tokens)
         pricing = get_model_pricing(model.base_model)
         if pricing is not None:
-            metrics["costs_train"] = compute_train_cost(train_tokens, pricing)
+            metrics["costs/train/tinker_train"] = compute_train_cost(
+                train_tokens, pricing
+            )
+        trainer_started = time.monotonic()
 
         if adam_params is None:
             adam_params = tinker.AdamParams(
@@ -268,12 +308,16 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum:
             for key, value in forward_output.metrics.items():
                 if value is None:
                     continue
-                metrics[key] = float(value)
+                canonical_key = _canonicalize_upstream_metric_key(key)
+                if canonical_key:
+                    metrics[canonical_key] = float(value)
         if optim_output.metrics:
             for key, value in optim_output.metrics.items():
                 if value is None:
                     continue
-                metrics[key] = float(value)
+                canonical_key = _canonicalize_upstream_metric_key(key)
+                if canonical_key:
+                    metrics[canonical_key] = float(value)
 
         next_step = state.current_step + 1
         checkpoint_name = f"step_{next_step:06d}"
@@ -298,6 +342,7 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum:
 
         state.current_step = next_step
         self._persist_model_state(model, state)
+        metrics["time/step_trainer_s"] = time.monotonic() - trainer_started
 
         return TrainResult(step=state.current_step, metrics=metrics)
 
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
index fad04fbc3..76ab19911 100644
--- a/src/art/unsloth/service.py
+++ b/src/art/unsloth/service.py
@@ -13,6 +13,7 @@
 from datasets import Dataset
 import peft
 import torch
+from torch.optim import Optimizer
 from transformers import GenerationMixin, PreTrainedModel
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from trl import GRPOConfig, GRPOTrainer
@@ -190,6 +191,13 @@ def save_checkpoint(
     return checkpoint_dir
 
 
+def _get_trainer_optimizer(trainer: GRPOTrainer) -> Optimizer:
+    """Return ``trainer.optimizer``, raising RuntimeError if it is missing or None."""
+    if (found := getattr(trainer, "optimizer", None)) is None:
+        raise RuntimeError("Trainer optimizer must be initialized before training")
+    return cast(Optimizer, found)
+
+
 # ============================================================================
 # Model Classes
 # ============================================================================
@@ -541,10 +549,11 @@ def _reset_optimizer_if_mode_changed(
         mode_changed = (
             self._last_training_mode is not None and self._last_training_mode != mode
         )
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         if mode_changed:
             # Clear all optimizer state (exp_avg, exp_avg_sq, step for each param)
-            self._state.trainer.optimizer.state.clear()
+            optimizer.state.clear()
 
         self._last_training_mode = mode
 
@@ -576,9 +585,10 @@ async def _train_dedicated(
     ) -> AsyncIterator[dict[str, float]]:
         """Train in dedicated mode — no sleep/wake, vLLM keeps running on separate GPU."""
         self._reset_optimizer_if_mode_changed("rl")
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         rl_weight_decay = 0.1
-        for param_group in self._state.trainer.optimizer.param_groups:
+        for param_group in optimizer.param_groups:
             param_group["weight_decay"] = rl_weight_decay
 
         packed_tensors = packed_tensors_from_dir(**disk_packed_tensors)
@@ -661,10 +671,11 @@ async def _train_shared(
 
         # Reset optimizer state if switching from SFT to RL
         self._reset_optimizer_if_mode_changed("rl")
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         # Set RL-specific hyperparameters
         rl_weight_decay = 0.1
-        for param_group in self._state.trainer.optimizer.param_groups:
+        for param_group in optimizer.param_groups:
             param_group["weight_decay"] = rl_weight_decay
 
         # Load packed tensors
@@ -794,7 +805,7 @@ async def train_sft(
         # Get model and optimizer
         peft_model = self._state.peft_model
         self._reset_optimizer_if_mode_changed("sft")
-        optimizer = self._state.trainer.optimizer
+        optimizer = _get_trainer_optimizer(self._state.trainer)
 
         # Set SFT-specific hyperparameters
         sft_weight_decay = 0.01
@@ -873,12 +884,11 @@ async def train_sft(
             batch_idx += 1
 
             yield {
-                "loss": batch_loss,
-                "learning_rate": batch.learning_rate,
-                "grad_norm": grad_norm,
-                "num_trajectories": float(batch.num_trajectories),
-                "num_trainable_tokens": float(batch.num_trainable_tokens),
-                "tokens_per_second": tokens_per_second,
+                "loss/train": batch_loss,
+                "loss/learning_rate": batch.learning_rate,
+                "loss/grad_norm": grad_norm,
+                "train/num_trajectories": float(batch.num_trajectories),
+                "train/num_trainable_tokens": float(batch.num_trainable_tokens),
             }
 
         # === Cleanup ===
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
index fcb7e287a..399c1c728 100644
--- a/src/art/unsloth/train.py
+++ b/src/art/unsloth/train.py
@@ -3,7 +3,7 @@
 from contextlib import contextmanager, nullcontext
 import gc
 import os
-from typing import TYPE_CHECKING, Callable, cast
+from typing import TYPE_CHECKING, Any, Callable, cast
 
 import nest_asyncio
 from peft.peft_model import PeftModel
@@ -19,6 +19,43 @@
 
 nest_asyncio.apply()
 
+_UPSTREAM_TRAIN_METRIC_KEYS: dict[str, str] = {  # upstream name -> taxonomy key
+    "reward": "reward/mean",
+    "reward_std_dev": "reward/std_dev",
+    "exception_rate": "reward/exception_rate",
+    "policy_loss": "loss/train",
+    "loss": "loss/train",  # same target as "policy_loss"
+    "entropy": "loss/entropy",
+    "kl_div": "loss/kl_div",
+    "kl_policy_ref": "loss/kl_policy_ref",
+    "grad_norm": "loss/grad_norm",
+    "learning_rate": "loss/learning_rate",
+    "num_groups_submitted": "train/num_groups_submitted",
+    "num_groups_trainable": "train/num_groups_trainable",
+    "num_trajectories": "train/num_trajectories",
+    "num_trainable_tokens": "train/num_trainable_tokens",
+    "train_tokens": "data/step_trainer_tokens",
+    "num_datums": "data/step_num_datums",
+}
+
+
+def _canonicalize_upstream_metric_key(metric: str) -> str:
+    """Return the taxonomy key for an upstream metric ("" marks one to drop)."""
+    if "/" in metric:  # already namespaced; pass through untouched
+        return metric
+    if metric.startswith("group_metric_"):
+        return "reward/group_" + metric[len("group_metric_") :]
+    dropped = metric == "tokens_per_second"
+    return "" if dropped else _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric)
+
+
+def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]:
+    """Rename metric keys to their taxonomy names, skipping dropped ("") keys."""
+    renamed = (
+        (_canonicalize_upstream_metric_key(name), val) for name, val in metrics.items()
+    )
+    return {target: float(val) for target, val in renamed if target}
+
 
 async def train(
     trainer: "GRPOTrainer",
@@ -169,19 +206,21 @@ def compute_loss(
             _config,
         )
 
-        trainer._metrics["train"]["learning_rate"].append(config.learning_rate)
-        trainer._metrics["train"]["policy_loss"].append(loss.mean_policy_loss.item())
+        trainer._metrics["train"]["loss/learning_rate"].append(config.learning_rate)
+        trainer._metrics["train"]["loss/train"].append(loss.mean_policy_loss.item())
         if loss.mean_entropy is not None:
-            trainer._metrics["train"]["entropy"].append(loss.mean_entropy.item())
+            trainer._metrics["train"]["loss/entropy"].append(loss.mean_entropy.item())
         if loss.kl_policy_ref is not None:
-            trainer._metrics["train"]["kl_policy_ref"].append(loss.kl_policy_ref.item())
+            trainer._metrics["train"]["loss/kl_policy_ref"].append(
+                loss.kl_policy_ref.item()
+            )
         return loss.mean_policy_loss
 
     return compute_loss
 
 
 def get_log_fn(
-    trainer: "GRPOTrainer", results_queue: asyncio.Queue[dict[str, float]]
+    trainer: Any, results_queue: asyncio.Queue[dict[str, float]]
 ) -> Callable[..., None]:
     def log(logs: dict[str, float], start_time: float | None = None) -> None:
         metrics = {
@@ -189,13 +228,18 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None:
         }  # average the metrics
 
         # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
-        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        # start with "eval_". Normalize them into the `val/...` taxonomy instead.
         if next(iter(logs.keys())).startswith("eval_"):
-            metrics = {f"eval_{key}": val for key, val in metrics.items()}
-
-        logs = {**logs, **metrics}
-        logs.pop("learning_rate", None)
-        results_queue.put_nowait(logs)
+            normalized_metrics = {f"val/{key}": val for key, val in metrics.items()}
+            normalized_logs = {
+                f"val/{_canonicalize_upstream_metric_key(key[len('eval_') :])}": val
+                for key, val in logs.items()
+            }
+            results_queue.put_nowait({**normalized_metrics, **normalized_logs})
+        else:
+            results_queue.put_nowait(
+                {**_canonicalize_upstream_metrics(logs), **metrics}
+            )
         trainer._metrics["train"].clear()
 
     return log
diff --git a/tests/integration/test_live_api_cost.py b/tests/integration/test_live_api_cost.py
new file mode 100644
index 000000000..ad7438bee
--- /dev/null
+++ b/tests/integration/test_live_api_cost.py
@@ -0,0 +1,224 @@
+import json
+import os
+from pathlib import Path
+import urllib.request
+from uuid import uuid4
+
+import pytest
+
+from art import Model
+from art.metrics import track_api_cost
+
+pytestmark = pytest.mark.live_api_cost
+
+_LIVE_ENV = "ART_RUN_LIVE_API_COST_TESTS"
+
+
+def _require_live_test_env(*required_vars: str) -> None:
+    """Skip the calling test unless live runs are enabled and all vars are set."""
+    if os.environ.get(_LIVE_ENV) != "1":
+        pytest.skip(f"Set {_LIVE_ENV}=1 to run live API cost tests.")
+    if unset := [var for var in required_vars if not os.environ.get(var)]:
+        pytest.skip("Missing required env vars: " + ", ".join(unset))
+
+
+def _post_json(url: str, *, headers: dict[str, str], payload: dict) -> dict:
+    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply."""
+    body = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
+    # Generous timeout: callers use this for live requests to hosted LLM APIs.
+    with urllib.request.urlopen(req, timeout=120) as resp:
+        raw_reply = resp.read()
+    reply_text = raw_reply.decode("utf-8")
+    return json.loads(reply_text)
+
+
+def _cacheable_prefix(word_count: int = 1500) -> str:
+    return " ".join(["cache-token-" + str(n % 16) for n in range(word_count)])
+
+
+def _history_rows(history_path: Path) -> list[dict]:
+    return list(map(json.loads, filter(None, history_path.read_text().splitlines())))
+
+
+def _openai_completion(*, api_key: str, prompt_cache_key: str, prefix: str) -> dict:
+    """Send one tiny gpt-4.1 chat completion whose system prompt is ``prefix``."""
+    auth_headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    system_turn = {"role": "system", "content": prefix}
+    user_turn = {"role": "user", "content": "Reply with OK."}
+    # Keep the reply tiny: temperature 0 and at most 4 completion tokens.
+    request_body = {
+        "model": "gpt-4.1",
+        "messages": [system_turn, user_turn],
+        "temperature": 0,
+        "max_completion_tokens": 4,
+        "prompt_cache_key": prompt_cache_key,
+    }
+    endpoint = "https://api.openai.com/v1/chat/completions"
+    return _post_json(endpoint, headers=auth_headers, payload=request_body)
+
+
+def _anthropic_message(*, api_key: str, prefix: str) -> dict:
+    """Send one short Claude message with ``prefix`` as a cache-marked system block."""
+    request_headers = {
+        "x-api-key": api_key,
+        "anthropic-version": "2023-06-01",
+        "content-type": "application/json",
+    }
+    # cache_control asks the API to treat this system block as an ephemeral cache entry.
+    cached_system_block = {
+        "type": "text",
+        "text": prefix,
+        "cache_control": {"type": "ephemeral"},
+    }
+    user_turn = {"role": "user", "content": "Reply with OK."}
+    request_body = {
+        "model": "claude-sonnet-4-6",
+        "max_tokens": 8,
+        "temperature": 0,
+        "system": [cached_system_block],
+        "messages": [user_turn],
+    }
+
+    endpoint = "https://api.anthropic.com/v1/messages"
+    return _post_json(endpoint, headers=request_headers, payload=request_body)
+
+
+class TestLiveApiCost:
+    """Live checks that tracked API cost matches provider-reported token usage."""
+
+    @pytest.mark.asyncio
+    async def test_openai_gpt_4_1_cached_prompt_cost(self, tmp_path: Path) -> None:
+        """Check the logged cost of a gpt-4.1 call that hits the prompt cache."""
+        _require_live_test_env("OPENAI_API_KEY")
+
+        api_key = os.environ["OPENAI_API_KEY"]
+        prefix = _cacheable_prefix()
+        prompt_cache_key = f"art-live-api-cost-{uuid4()}"
+
+        # Warm the cache first so the tracked request can validate cached pricing.
+        _openai_completion(
+            api_key=api_key,
+            prompt_cache_key=prompt_cache_key,
+            prefix=prefix,
+        )
+
+        model = Model(
+            name="live-openai-api-cost",
+            project="live-api-cost",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/openai_cached_prompt",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
+        def _judge() -> dict:
+            return _openai_completion(
+                api_key=api_key,
+                prompt_cache_key=prompt_cache_key,
+                prefix=prefix,
+            )
+
+        # Make the tracked call while the "eval" metrics context is active.
+        token = model.activate_metrics_context("eval")
+        try:
+            response = _judge()
+        finally:
+            token.var.reset(token)
+
+        await model.log(trajectories=None, split="val", step=1, metrics={})
+
+        usage = response["usage"]
+        cached_tokens = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
+        assert cached_tokens > 0
+
+        # Rates are per million tokens: 2.0 uncached input, 0.5 cached, 8.0 output.
+        expected_cost = (
+            ((usage["prompt_tokens"] - cached_tokens) * 2.0)
+            + (cached_tokens * 0.5)
+            + (usage["completion_tokens"] * 8.0)
+        ) / 1_000_000
+
+        model_dir = tmp_path / "live-api-cost" / "models" / "live-openai-api-cost"
+        row = _history_rows(model_dir / "history.jsonl")[0]
+        assert row["costs/eval/llm_judge/openai_cached_prompt"] == pytest.approx(
+            expected_cost
+        )
+
+    @pytest.mark.asyncio
+    async def test_anthropic_claude_sonnet_4_6_prompt_cache_cost(
+        self,
+        tmp_path: Path,
+    ) -> None:
+        """Check logged costs for a cache-write call and then a cache-read call."""
+        _require_live_test_env("ANTHROPIC_API_KEY")
+
+        api_key = os.environ["ANTHROPIC_API_KEY"]
+        # A per-run unique prefix makes the first call create a fresh cache entry
+        # instead of reading one left behind by an earlier run of this test.
+        prefix = f"art-live-api-cost-{uuid4()} {_cacheable_prefix()}"
+
+        model = Model(
+            name="live-anthropic-api-cost",
+            project="live-api-cost",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/anthropic_prompt_cache",
+            provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
+        )
+        def _judge() -> dict:
+            return _anthropic_message(api_key=api_key, prefix=prefix)
+
+        # First tracked call: expected to write the prompt cache (logged as step 1).
+        token = model.activate_metrics_context("eval")
+        try:
+            first_response = _judge()
+        finally:
+            token.var.reset(token)
+        await model.log(trajectories=None, split="val", step=1, metrics={})
+
+        # Second tracked call: expected to read the prompt cache (logged as step 2).
+        token = model.activate_metrics_context("eval")
+        try:
+            second_response = _judge()
+        finally:
+            token.var.reset(token)
+        await model.log(trajectories=None, split="val", step=2, metrics={})
+
+        first_usage = first_response["usage"]
+        second_usage = second_response["usage"]
+        assert first_usage.get("cache_creation_input_tokens", 0) > 0
+        assert second_usage.get("cache_read_input_tokens", 0) > 0
+
+        # Rates per million tokens: 3.0 input, 3.75 cache write, 0.30 cache read,
+        # 15.0 output.
+        first_expected_cost = (
+            (first_usage["input_tokens"] * 3.0)
+            + (first_usage.get("cache_creation_input_tokens", 0) * 3.75)
+            + (first_usage["output_tokens"] * 15.0)
+        ) / 1_000_000
+        second_expected_cost = (
+            (second_usage["input_tokens"] * 3.0)
+            + (second_usage.get("cache_read_input_tokens", 0) * 0.30)
+            + (second_usage["output_tokens"] * 15.0)
+        ) / 1_000_000
+
+        model_dir = tmp_path / "live-api-cost" / "models" / "live-anthropic-api-cost"
+        first_row, second_row = _history_rows(model_dir / "history.jsonl")
+
+        assert first_row[
+            "costs/eval/llm_judge/anthropic_prompt_cache"
+        ] == pytest.approx(first_expected_cost)
+        assert second_row[
+            "costs/eval/llm_judge/anthropic_prompt_cache"
+        ] == pytest.approx(second_expected_cost)
diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py
index 202785892..2afb8af6e 100644
--- a/tests/unit/test_frontend_logging.py
+++ b/tests/unit/test_frontend_logging.py
@@ -17,6 +17,8 @@
 import pytest
 
 from art import Model, TrainableModel, Trajectory, TrajectoryGroup
+from art.local.backend import LocalBackend
+from art.metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY
 from art.utils.trajectory_logging import read_trajectory_groups_parquet
 
 
@@ -225,13 +227,12 @@ async def test_history_appends_entries(
         history_path = tmp_path / "test-project/models/test-model/history.jsonl"
         df = pl.read_ndjson(str(history_path))
 
-        # Should have 2 entries
         assert len(df) == 2
 
         # Check both splits are present
         columns = df.columns
         assert any("val/" in col for col in columns)
-        assert any("train/" in col for col in columns)
+        assert any("reward/" in col for col in columns)
 
 
 class TestPathStructure:
@@ -337,10 +338,22 @@ async def test_metric_prefixes(self, tmp_path: Path):
             entry = json.loads(f.readline())
 
         # All metrics should be prefixed (except step and recorded_at)
-        metric_keys = [k for k in entry.keys() if k not in ["step", "recorded_at"]]
-        assert all(k.startswith("val/") for k in metric_keys), (
-            f"Not all metrics prefixed: {metric_keys}"
+        metric_keys = [
+            k
+            for k in entry.keys()
+            if k
+            not in [
+                "step",
+                "recorded_at",
+                "training_step",
+                "time/wall_clock_sec",
+            ]
+        ]
+        assert all(k.startswith(("val/", "data/")) for k in metric_keys), (
+            f"Not all metrics routed into taxonomy namespaces: {metric_keys}"
         )
+        assert entry["training_step"] == 0
+        assert entry["time/wall_clock_sec"] >= 0
 
     @pytest.mark.asyncio
     async def test_standard_metrics_present(self, tmp_path: Path):
@@ -455,6 +468,340 @@ async def test_exception_rate_calculation(self, tmp_path: Path):
         # All successful trajectories = 0% exception rate
         assert entry["val/exception_rate"] == 0.0
 
+    @pytest.mark.asyncio
+    async def test_exception_rate_counts_group_exceptions(self, tmp_path: Path):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        trajectory_groups = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.5,
+                        messages_and_choices=[{"role": "user", "content": "test"}],
+                    )
+                ],
+                exceptions=[ValueError("boom")],  # one group-level exception
+            )
+        ]
+
+        await model.log(trajectory_groups, split="val")
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())  # first history row
+
+        assert entry["val/exception_rate"] == pytest.approx(0.5)  # 1 exception, 1 traj
+
+    @pytest.mark.asyncio
+    async def test_generator_of_trajectories_is_consumed_once(self, tmp_path: Path):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        def trajectories():  # generator: a single-pass iterable
+            yield Trajectory(
+                reward=1.0,
+                metrics={"custom": 1.0},
+                messages_and_choices=[{"role": "user", "content": "first"}],
+            )
+            yield Trajectory(
+                reward=3.0,
+                metrics={"custom": 3.0},
+                messages_and_choices=[{"role": "user", "content": "second"}],
+            )
+
+        await model.log(trajectories(), split="val")  # pass the generator itself
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["val/reward"] == pytest.approx(2.0)  # (1.0 + 3.0) / 2
+        assert entry["val/custom"] == pytest.approx(2.0)  # (1.0 + 3.0) / 2
+
+    @pytest.mark.asyncio
+    async def test_train_trajectory_metrics_default_to_reward_prefix(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        trajectories = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=0.7,
+                        metrics={
+                            "custom_score": 1.0,
+                            "reward/prefixed": 2.0,
+                        },
+                        messages_and_choices=[{"role": "user", "content": "test"}],
+                    )
+                ],
+                exceptions=[],
+            )
+        ]
+
+        await model.log(trajectories, split="train")
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["reward/mean"] == 0.7
+        assert entry["reward/exception_rate"] == 0.0
+        assert "train/reward" not in entry
+        assert entry["reward/custom_score"] == 1.0
+        assert entry["reward/prefixed"] == 2.0
+
+    @pytest.mark.asyncio
+    async def test_train_logs_add_default_data_metrics_from_trajectory_groups(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        trajectories = [
+            TrajectoryGroup(  # group 1: two trajectories, rewards 0.8 and 0.2
+                trajectories=[
+                    Trajectory(
+                        reward=0.8,
+                        messages_and_choices=[{"role": "user", "content": "a"}],
+                    ),
+                    Trajectory(
+                        reward=0.2,
+                        messages_and_choices=[{"role": "user", "content": "b"}],
+                    ),
+                ],
+                metadata={"scenario_id": "scenario-1"},
+            ),
+            TrajectoryGroup(  # group 2: a single trajectory
+                trajectories=[
+                    Trajectory(
+                        reward=0.5,
+                        messages_and_choices=[{"role": "user", "content": "c"}],
+                    )
+                ],
+                exceptions=[],
+                metadata={"scenario_id": "scenario-2"},
+            ),
+        ]
+
+        await model.log(trajectories, split="train", step=1)
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            rows = [json.loads(line) for line in f if line.strip()]  # all rows
+
+        merged: dict[str, float] = {}  # union of every logged row
+        for row in rows:
+            merged.update(row)  # later rows win on duplicate keys
+
+        assert merged["data/step_num_scenarios"] == pytest.approx(2.0)
+        assert merged["data/step_num_trajectories"] == pytest.approx(3.0)  # 2 + 1
+        assert merged["data/step_num_groups_submitted"] == pytest.approx(2.0)
+        assert merged["data/step_num_groups_trainable"] == pytest.approx(1.0)
+        assert merged["data/cum/num_unique_scenarios"] == pytest.approx(2.0)
+        assert merged["train/num_groups_submitted"] == pytest.approx(2.0)
+        assert merged["train/num_groups_trainable"] == pytest.approx(1.0)
+        assert merged["train/num_trajectories"] == pytest.approx(3.0)  # 2 + 1
+
+    @pytest.mark.asyncio
+    async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        await model.log(
+            trajectories=None,  # metrics-only log call
+            split="train",
+            step=1,
+            metrics={
+                "costs/train/prefill": 0.2,
+                "costs/train/sample": 0.3,
+            },
+        )
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=2,
+            metrics={
+                "costs/train/prefill": 0.1,  # step 2 logs prefill only
+            },
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            first = json.loads(f.readline())  # row for step 1
+            second = json.loads(f.readline())  # row for step 2
+
+        assert first["costs/train/prefill"] == pytest.approx(0.2)
+        assert first["costs/train/sample"] == pytest.approx(0.3)
+        assert first["costs/train"] == pytest.approx(0.5)  # 0.2 + 0.3
+        assert first["costs/all"] == pytest.approx(0.5)
+        assert first["costs/cum/all"] == pytest.approx(0.5)
+
+        assert second["costs/train/prefill"] == pytest.approx(0.1)
+        assert second["costs/cum/train/prefill"] == pytest.approx(0.3)  # 0.2 + 0.1
+        assert second["costs/cum/train"] == pytest.approx(0.6)  # 0.5 + 0.1
+        assert second["costs/cum/all"] == pytest.approx(0.6)  # 0.5 + 0.1
+
+    @pytest.mark.asyncio
+    async def test_cost_cumulative_persists_across_model_recreation(
+        self, tmp_path: Path
+    ):
+        model_1 = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        await model_1.log(
+            trajectories=None,
+            split="train",
+            step=1,
+            metrics={"costs/train/prefill": 0.25},
+        )
+
+        model_2 = Model(  # same name/project/base_path as model_1
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        await model_2.log(
+            trajectories=None,
+            split="train",
+            step=2,
+            metrics={"costs/train/prefill": 0.75},
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            first = json.loads(f.readline())  # logged through model_1
+            second = json.loads(f.readline())  # logged through model_2
+
+        assert first["costs/cum/train/prefill"] == pytest.approx(0.25)
+        assert second["costs/cum/train/prefill"] == pytest.approx(1.0)  # 0.25 + 0.75
+        assert second["costs/cum/all"] == pytest.approx(1.0)
+
+    @pytest.mark.asyncio
+    async def test_metrics_builder_loads_resume_state_before_builder_use(
+        self, tmp_path: Path
+    ):
+        model_1 = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        model_1.metrics_builder().add_data(scenario_ids=["scenario-a"])
+        await model_1.log(trajectories=None, split="train", step=1, metrics={})
+
+        model_2 = Model(  # same name/project/base_path as model_1
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        model_2.metrics_builder().add_data(scenario_ids=["scenario-b"])
+        await model_2.log(trajectories=None, split="train", step=2, metrics={})
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            first = json.loads(f.readline())  # logged through model_1
+            second = json.loads(f.readline())  # logged through model_2
+
+        assert first["data/cum/num_unique_scenarios"] == pytest.approx(1.0)
+        assert second["data/cum/num_unique_scenarios"] == pytest.approx(2.0)  # a + b
+
+    @pytest.mark.asyncio
+    async def test_direct_time_and_data_metrics_get_cumulative_variants(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        await model.log(
+            trajectories=None,  # metrics-only log call
+            split="train",
+            step=1,
+            metrics={
+                "time/step_actor_s": 1.5,
+                "data/step_actor_tokens": 10,
+            },
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["time/step_actor_s"] == pytest.approx(1.5)
+        assert entry["time/cum/actor_s"] == pytest.approx(1.5)  # from step_actor_s
+        assert entry["data/step_actor_tokens"] == pytest.approx(10)
+        assert entry["data/cum/actor_tokens"] == pytest.approx(10)  # from step tokens
+
+    @pytest.mark.asyncio
+    async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row(
+        self, tmp_path: Path
+    ):
+        model = Model(
+            name="test",
+            project="test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        model.metrics_builder().add_data(scenario_ids=["scenario-a"])
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=1,
+            metrics={
+                "time/step_trainer_s": 2.0,
+                "data/step_trainer_tokens": 20.0,
+            },
+        )
+        await model.log(
+            trajectories=None,
+            split="train",
+            step=2,
+            metrics={"loss/train": 1.0},
+        )
+
+        history_path = tmp_path / "test/models/test/history.jsonl"
+        rows = [json.loads(line) for line in history_path.open() if line.strip()]
+
+        assert len(rows) == 2
+        assert rows[0]["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
+        assert rows[0]["data/cum/num_unique_scenarios"] == pytest.approx(1.0)
+        assert rows[1]["loss/train"] == pytest.approx(1.0)
+        assert "throughput/avg_trainer_tok_per_s" not in rows[1]
+        assert "data/cum/num_unique_scenarios" not in rows[1]
+
 
 class TestWandbIntegration:
     """Test wandb integration logic (without mocking wandb itself)."""
@@ -551,6 +898,95 @@ def test_should_log_wandb_logic_empty_list(self, tmp_path: Path):
             assert should_log is False
 
 
+class TestLocalBackendAutomaticMetrics:
+    @pytest.mark.asyncio
+    async def test_train_logs_automatic_wall_time_and_gpu_cost(
+        self, tmp_path: Path
+    ) -> None:
+        backend = LocalBackend(gpu_cost_per_hour_usd=3.0)
+
+        with patch("art.model.time.monotonic", side_effect=[100.0, 106.0, 111.0]):
+            model = TrainableModel(
+                name="test-model",
+                project="test-project",
+                base_model="Qwen/Qwen3-4B-Instruct-2507",
+                base_path=str(tmp_path),
+                report_metrics=[],
+                _internal_config={"trainer_gpu_ids": [0]},
+            )
+            model._backend = backend
+
+            await model.log(
+                trajectories=None,
+                split="train",
+                step=1,
+                metrics={"loss/train": 1.0},
+            )
+            await model.log(
+                trajectories=None,
+                split="train",
+                step=2,
+                metrics={"loss/train": 0.5},
+            )
+
+        history_path = tmp_path / "test-project/models/test-model/history.jsonl"
+        rows = [json.loads(line) for line in history_path.open() if line.strip()]
+
+        first_gpu_cost = 6.0 * 3.0 / 3600.0
+        second_gpu_cost = 5.0 * 3.0 / 3600.0
+
+        assert rows[0]["time/step_wall_s"] == pytest.approx(6.0)
+        assert rows[0]["costs/gpu"] == pytest.approx(first_gpu_cost)
+        assert rows[0]["costs/all"] == pytest.approx(first_gpu_cost)
+        assert rows[0]["costs/cum/gpu"] == pytest.approx(first_gpu_cost)
+
+        assert rows[1]["time/step_wall_s"] == pytest.approx(5.0)
+        assert rows[1]["costs/gpu"] == pytest.approx(second_gpu_cost)
+        assert rows[1]["costs/cum/gpu"] == pytest.approx(
+            first_gpu_cost + second_gpu_cost
+        )
+        assert rows[1]["costs/cum/all"] == pytest.approx(
+            first_gpu_cost + second_gpu_cost
+        )
+
+    @pytest.mark.asyncio
+    async def test_unknown_local_gpu_skips_cost_but_keeps_wall_time(
+        self, tmp_path: Path
+    ) -> None:
+        backend = LocalBackend()
+
+        with patch("art.model.time.monotonic", side_effect=[50.0, 55.0]):
+            with patch("art.local.backend.torch.cuda.is_available", return_value=True):
+                with patch("art.local.backend.torch.cuda.device_count", return_value=1):
+                    with patch(
+                        "art.local.backend.torch.cuda.get_device_name",
+                        return_value="NVIDIA A100-SXM4-80GB",
+                    ):
+                        model = TrainableModel(
+                            name="test-model",
+                            project="test-project",
+                            base_model="Qwen/Qwen3-4B-Instruct-2507",
+                            base_path=str(tmp_path),
+                            report_metrics=[],
+                            _internal_config={"trainer_gpu_ids": [0]},
+                        )
+                        model._backend = backend
+                        await model.log(
+                            trajectories=None,
+                            split="train",
+                            step=1,
+                            metrics={"loss/train": 1.0},
+                        )
+
+        history_path = tmp_path / "test-project/models/test-model/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["time/step_wall_s"] == pytest.approx(5.0)
+        assert "costs/gpu" not in entry
+        assert "costs/all" not in entry
+
+
 class TestModelAttributes:
     """Test new Model attributes."""
 
@@ -593,9 +1029,21 @@ async def test_train_sft_aggregates_metrics(self, tmp_path: Path):
 
         async def mock_train_sft(*args, **kwargs):
             # Simulate 3 batches with different metrics
-            yield {"loss": 1.0, "learning_rate": 1e-4, "grad_norm": 0.5}
-            yield {"loss": 0.8, "learning_rate": 1e-4, "grad_norm": 0.4}
-            yield {"loss": 0.6, "learning_rate": 1e-4, "grad_norm": 0.3}
+            yield {
+                "loss/train": 1.0,
+                "loss/learning_rate": 1e-4,
+                "loss/grad_norm": 0.5,
+            }
+            yield {
+                "loss/train": 0.8,
+                "loss/learning_rate": 1e-4,
+                "loss/grad_norm": 0.4,
+            }
+            yield {
+                "loss/train": 0.6,
+                "loss/learning_rate": 1e-4,
+                "loss/grad_norm": 0.3,
+            }
 
         mock_backend._train_sft = mock_train_sft
         mock_backend._get_step = AsyncMock(return_value=1)  # Step after training
@@ -625,11 +1073,16 @@ async def mock_train_sft(*args, **kwargs):
 
         assert len(lines) == 1, f"Expected 1 log entry, got {len(lines)}"
 
-        # Verify metrics are aggregated (averaged)
-        entry = json.loads(lines[0])
-        assert entry["step"] == 1
-        assert entry["train/loss"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
-        assert entry["train/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
+        entries = [json.loads(line) for line in lines]
+        merged: dict[str, float] = {}
+        for entry in entries:
+            merged.update(entry)
+
+        assert all(entry["step"] == 1 for entry in entries)
+        assert merged["loss/train"] == pytest.approx(0.8)  # (1.0 + 0.8 + 0.6) / 3
+        assert merged["loss/grad_norm"] == pytest.approx(0.4)  # (0.5 + 0.4 + 0.3) / 3
+        assert merged["time/step_trainer_s"] >= 0
+        assert merged["time/cum/trainer_s"] >= 0
 
     @pytest.mark.asyncio
     async def test_train_sft_single_step_increment(self, tmp_path: Path):
@@ -667,7 +1120,7 @@ async def mock_train_sft(*args, **kwargs):
         df = pl.read_ndjson(str(history_path))
 
         assert len(df) == 1, "Should have exactly 1 log entry"
-        assert df["step"][0] == 1, "Step should be 1 (single increment)"
+        assert set(df["step"].to_list()) == {1}, "Step should be 1 (single increment)"
 
     @pytest.mark.asyncio
     async def test_train_sft_no_metrics_when_empty(self, tmp_path: Path):
@@ -698,3 +1151,92 @@ async def mock_train_sft(*args, **kwargs):
         assert not history_path.exists(), (
             "No history.jsonl should be created for empty training"
         )
+
+
+class TestGradientStepMetrics:
+    @pytest.mark.asyncio
+    async def test_model_train_logs_gradient_step_count(self, tmp_path: Path):
+        model = TrainableModel(
+            name="test-train",
+            project="test-project",
+            base_model="gpt-4",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        async def mock_train_model(*args, **kwargs):
+            for loss in (1.0, 0.8, 0.6):
+                yield {
+                    "loss/train": loss,
+                    TRAIN_GRADIENT_STEPS_KEY: 3.0,
+                }
+
+        mock_backend = MagicMock()
+        mock_backend._train_model = mock_train_model
+        mock_backend._get_step = AsyncMock(return_value=1)
+        model._backend = mock_backend
+
+        groups = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+        ]
+
+        await model.train(groups)
+
+        history_path = tmp_path / "test-project/models/test-train/history.jsonl"
+        rows = [json.loads(line) for line in history_path.open() if line.strip()]
+        merged: dict[str, float] = {}
+        for row in rows:
+            merged.update(row)
+
+        assert merged[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_local_backend_train_returns_gradient_step_count(
+        self, tmp_path: Path
+    ):
+        model = TrainableModel(
+            name="test-backend-train",
+            project="test-project",
+            base_model="gpt-4",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = LocalBackend(path=str(tmp_path))
+
+        async def mock_train_model(*args, **kwargs):
+            for loss in (1.0, 0.8):
+                yield {
+                    "loss/train": loss,
+                    TRAIN_GRADIENT_STEPS_KEY: 2.0,
+                }
+
+        backend._train_model = mock_train_model  # type: ignore[method-assign]
+        backend._get_step = AsyncMock(return_value=1)  # type: ignore[method-assign]
+
+        groups = [
+            TrajectoryGroup(
+                trajectories=[
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+        ]
+
+        result = await backend.train(model, groups, save_checkpoint=False)
+
+        assert result.metrics[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(2.0)
diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py
new file mode 100644
index 000000000..8f6cad928
--- /dev/null
+++ b/tests/unit/test_metric_routing.py
@@ -0,0 +1,122 @@
+import json
+import os
+from pathlib import Path
+import types
+from unittest.mock import MagicMock, patch
+
+from art import Model
+
+
+class TestMetricRoutingBaseline:  # key routing and wandb define_metric checks
+    def test_log_metrics_routes_known_sections_without_split_prefix(
+        self, tmp_path: Path
+    ) -> None:
+        model = Model(
+            name="test-model",
+            project="test-project",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        model._log_metrics(
+            {
+                "reward/mean": 0.9,  # expected to keep its key unchanged
+                "custom": 1.0,  # bare key: expected as train/custom
+                "rewardish/value": 2.0,  # expected under train/ as well
+            },
+            split="train",
+            step=7,
+        )
+
+        history_path = tmp_path / "test-project/models/test-model/history.jsonl"
+        with open(history_path) as f:
+            entry = json.loads(f.readline())
+
+        assert entry["reward/mean"] == 0.9
+        assert entry["train/custom"] == 1.0
+        assert entry["train/rewardish/value"] == 2.0
+        assert entry["training_step"] == 7  # mirrors the step argument
+        assert entry["time/wall_clock_sec"] >= 0
+
+    def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None:
+        fake_run = MagicMock()
+        fake_run._is_finished = False
+
+        fake_wandb = types.SimpleNamespace()  # stand-in for the wandb module
+        fake_wandb.init = MagicMock(return_value=fake_run)
+        fake_wandb.define_metric = MagicMock()
+        fake_wandb.Settings = lambda **kwargs: kwargs  # returns its kwargs dict
+
+        with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False):
+            with patch.dict("sys.modules", {"wandb": fake_wandb}):
+                model = Model(
+                    name="test-model",
+                    project="test-project",
+                    base_path=str(tmp_path),
+                )
+                run = model._get_wandb_run()
+
+        assert run is fake_run
+        define_calls = [
+            (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list
+        ]
+        assert define_calls == [  # exact call order is part of the assertion
+            (("training_step",), {}),
+            (("time/wall_clock_sec",), {}),
+            (("reward/*",), {"step_metric": "training_step"}),
+            (("loss/*",), {"step_metric": "training_step"}),
+            (("throughput/*",), {"step_metric": "training_step"}),
+            (("costs/*",), {"step_metric": "training_step"}),
+            (("time/*",), {"step_metric": "training_step"}),
+            (("data/*",), {"step_metric": "training_step"}),
+            (("train/*",), {"step_metric": "training_step"}),
+            (("val/*",), {"step_metric": "training_step"}),
+            (("test/*",), {"step_metric": "training_step"}),
+        ]
+
+    def test_log_metrics_defines_nested_cost_keys_with_training_step(
+        self, tmp_path: Path
+    ) -> None:
+        fake_run = MagicMock()
+        fake_run._is_finished = False
+
+        fake_wandb = types.SimpleNamespace()  # stand-in for the wandb module
+        fake_wandb.init = MagicMock(return_value=fake_run)
+        fake_wandb.define_metric = MagicMock()
+        fake_wandb.Settings = lambda **kwargs: kwargs  # returns its kwargs dict
+
+        with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False):
+            with patch.dict("sys.modules", {"wandb": fake_wandb}):
+                model = Model(
+                    name="test-model",
+                    project="test-project",
+                    base_path=str(tmp_path),
+                    report_metrics=["wandb"],
+                )
+                model._log_metrics(
+                    {
+                        "costs/train/sample": 0.1,
+                        "costs/cum/train/prefill": 0.2,
+                    },
+                    split="train",
+                    step=1,
+                )
+
+        define_calls = [
+            (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list
+        ]
+        assert (  # membership check only; call order is not asserted here
+            ("costs/train/sample",),
+            {"step_metric": "training_step"},
+        ) in define_calls
+        assert (
+            ("costs/cum/train/prefill",),
+            {"step_metric": "training_step"},
+        ) in define_calls
+        fake_run.log.assert_called_once()
+        logged_metrics = fake_run.log.call_args.args[0]  # first positional arg
+        assert logged_metrics["costs/train/sample"] == 0.1
+        assert logged_metrics["costs/cum/train/prefill"] == 0.2
+        assert logged_metrics["training_step"] == 1
+        assert "time/wall_clock_sec" in logged_metrics
+        assert fake_run.log.call_args.kwargs == {}  # no keyword args passed to log
diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py
new file mode 100644
index 000000000..dfa24a113
--- /dev/null
+++ b/tests/unit/test_metrics_builder.py
@@ -0,0 +1,254 @@
+import asyncio
+
+import pytest
+
+from art.metrics import MetricsBuilder
+
+
+class TestMetricsBuilder:
+    """Unit tests for `MetricsBuilder` rollups, cumulative state, and helpers."""
+
+    @pytest.mark.asyncio
+    async def test_rollup_correctness_across_depths(self) -> None:
+        """Leaf costs roll up into every parent key, per step and cumulatively."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train/llm_judge/general_judge", usd=0.08)
+        builder.add_cost("train/llm_judge/hallucination_judge", usd=0.04)
+        builder.add_cost("train/tinker_train", usd=1.20)
+        builder.add_cost("train/tinker_inference", usd=0.45)
+        builder.add_cost("eval/llm_judge/correctness", usd=0.06)
+        metrics = await builder.flush()
+        assert metrics["costs/train/llm_judge"] == pytest.approx(0.12)
+        assert metrics["costs/train"] == pytest.approx(1.77)
+        assert metrics["costs/eval"] == pytest.approx(0.06)
+        assert metrics["costs/all"] == pytest.approx(1.83)
+        assert metrics["costs/cum/train/llm_judge"] == pytest.approx(0.12)
+        assert metrics["costs/cum/train"] == pytest.approx(1.77)
+        assert metrics["costs/cum/all"] == pytest.approx(1.83)
+
+    @pytest.mark.asyncio
+    async def test_cum_accumulates_for_hierarchical_sections(self) -> None:
+        """Cumulative keys add up across flushes; repeated scenario IDs count once."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3)
+        builder.add_data(
+            step_num_scenarios=2,
+            step_actor_tokens=10,
+            scenario_ids=["a", "b"],
+        )
+        first = await builder.flush()
+        assert first["time/cum/wall_s"] == pytest.approx(1.5)
+        assert first["time/cum/actor_s"] == pytest.approx(0.3)
+        assert first["data/cum/num_scenarios"] == pytest.approx(2)
+        assert first["data/cum/actor_tokens"] == pytest.approx(10)
+        assert first["data/cum/num_unique_scenarios"] == 2
+        # Second step: scenario "b" was already seen, so only "c" is new.
+        builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2)
+        builder.add_data(
+            step_num_scenarios=3,
+            step_actor_tokens=5,
+            scenario_ids=["b", "c"],
+        )
+        second = await builder.flush()
+        assert second["time/cum/wall_s"] == pytest.approx(2.0)
+        assert second["time/cum/actor_s"] == pytest.approx(0.5)
+        assert second["data/cum/num_scenarios"] == pytest.approx(5)
+        assert second["data/cum/actor_tokens"] == pytest.approx(15)
+        assert second["data/cum/num_unique_scenarios"] == 3
+
+    @pytest.mark.asyncio
+    async def test_helper_metrics_accumulate_within_a_single_step(self) -> None:
+        """Repeated helper calls within one step are summed in the flushed metrics."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_data(step_num_scenarios=2, step_actor_tokens=10)
+        builder.add_data(step_num_scenarios=3, step_actor_tokens=5)
+        builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3, step_eval_s=0.2)
+        builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2, step_eval_s=0.1)
+        builder.add_idle_times(step_trainer_idle_s=1.0, step_actor_idle_s=2.0)
+        builder.add_idle_times(step_trainer_idle_s=0.5, step_actor_idle_s=1.0)
+        metrics = await builder.flush()
+        assert metrics["data/step_num_scenarios"] == pytest.approx(5)
+        assert metrics["data/step_actor_tokens"] == pytest.approx(15)
+        assert metrics["time/step_wall_s"] == pytest.approx(2.0)
+        assert metrics["time/step_actor_s"] == pytest.approx(0.5)
+        assert metrics["time/step_eval_s"] == pytest.approx(0.3)
+        assert metrics["throughput/step_trainer_idle_s"] == pytest.approx(1.5)
+        assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_throughput_metrics_derive_from_time_and_token_cumulatives(
+        self,
+    ) -> None:
+        """Idle totals and average tokens/sec are derived from the recorded inputs."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_metric("time/step_trainer_s", 4.0)
+        builder.add_metric("data/step_trainer_tokens", 40.0)
+        builder.add_metric("time/step_actor_s", 2.0)
+        builder.add_metric("data/step_actor_tokens", 10.0)
+        builder.add_idle_times(step_trainer_idle_s=1.5, step_actor_idle_s=0.5)
+        metrics = await builder.flush()
+
+        assert metrics["throughput/cum/trainer_idle_s"] == pytest.approx(1.5)
+        assert metrics["throughput/cum/actor_idle_s"] == pytest.approx(0.5)
+        assert metrics["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
+        assert metrics["throughput/avg_actor_tok_per_s"] == pytest.approx(5.0)
+
+    @pytest.mark.asyncio
+    async def test_costs_all_generated_for_single_and_multiple_children(self) -> None:
+        """`costs/all` is emitted for one cost subtree as well as for several."""
+        single = MetricsBuilder(cost_context="train")
+        single.add_cost("train/gpu", usd=2.0)
+        one = await single.flush()
+        assert one["costs/all"] == pytest.approx(2.0)
+
+        multi = MetricsBuilder(cost_context="train")
+        multi.add_cost("train/gpu", usd=2.0)
+        multi.add_cost("eval/llm_judge/correctness", usd=0.5)
+        two = await multi.flush()
+        assert two["costs/all"] == pytest.approx(2.5)
+
+    def test_leaf_parent_conflicts_raise(self) -> None:
+        """A cost path cannot be both a leaf and the parent of another leaf."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train", usd=1.0)
+        with pytest.raises(ValueError):
+            builder.add_cost("train/llm_judge", usd=0.1)
+
+        other = MetricsBuilder(cost_context="train")
+        other.add_cost("train/llm_judge", usd=0.1)
+        with pytest.raises(ValueError):
+            other.add_cost("train", usd=1.0)
+
+    @pytest.mark.asyncio
+    async def test_duplicate_leaf_writes_are_summed(self) -> None:
+        """Writing the same cost leaf twice in one step adds the amounts."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train/gpu", usd=1.25)
+        builder.add_cost("train/gpu", usd=0.75)
+        metrics = await builder.flush()
+        assert metrics["costs/train/gpu"] == pytest.approx(2.0)
+        assert metrics["costs/train"] == pytest.approx(2.0)
+        assert metrics["costs/all"] == pytest.approx(2.0)
+
+    def test_cumulative_namespace_is_reserved(self) -> None:
+        """Writing directly to a `costs/cum/...` key raises `ValueError`."""
+        builder = MetricsBuilder(cost_context="train")
+        with pytest.raises(ValueError):
+            builder.add_metric("costs/cum/train/llm_judge", 0.1)
+
+    @pytest.mark.asyncio
+    async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None:
+        """A flush with no new costs emits no `costs/` keys; totals resume later."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_cost("train/gpu", usd=1.0)
+        first = await builder.flush()
+        assert first["costs/cum/train"] == pytest.approx(1.0)
+        second = await builder.flush()
+        assert not any(key.startswith("costs/") for key in second)
+
+        builder.add_cost("train/gpu", usd=2.0)
+        third = await builder.flush()
+        assert third["costs/train"] == pytest.approx(2.0)
+        assert third["costs/cum/train"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None:
+        """Cumulative costs survive a `state_dict` / `load_state_dict` round trip."""
+        before = MetricsBuilder(cost_context="train")
+        before.add_cost("train/gpu", usd=1.0)
+        await before.flush()
+        state = before.state_dict()
+        after = MetricsBuilder(cost_context="train")
+        after.load_state_dict(state)
+        after.add_cost("train/gpu", usd=2.0)
+
+        metrics = await after.flush()
+        assert metrics["costs/cum/train"] == pytest.approx(3.0)
+        assert metrics["costs/cum/all"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None:
+        """Loaded totals are visible through a builder from `for_cost_context`."""
+        before = MetricsBuilder(cost_context="train")
+        before.add_cost("train/gpu", usd=1.0)
+        await before.flush()
+        after = MetricsBuilder(cost_context="train")
+        after.load_state_dict(before.state_dict())
+        eval_builder = after.for_cost_context("eval")
+        eval_builder.add_cost("eval/judge", usd=2.0)
+
+        metrics = await eval_builder.flush()
+        assert metrics["costs/eval/judge"] == pytest.approx(2.0)
+        assert metrics["costs/cum/all"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_add_response_cost_uses_registered_model_pricing(self) -> None:
+        """`add_response_cost` prices usage with the registered per-million rates."""
+        builder = MetricsBuilder(cost_context="eval")
+        builder.register_model_pricing(
+            "anthropic/test-judge",
+            prompt_per_million=5.0,
+            completion_per_million=7.0,
+        )
+        cost = builder.add_response_cost(
+            "llm_judge/faithfulness",
+            {
+                "model": "anthropic/test-judge",
+                "usage": {"input_tokens": 40, "output_tokens": 60},
+            },
+            provider="anthropic",
+            model_name="anthropic/test-judge",
+        )
+        metrics = await builder.flush()
+        assert cost == pytest.approx(0.00062)
+        assert metrics["costs/eval/llm_judge/faithfulness"] == pytest.approx(0.00062)
+
+    @pytest.mark.asyncio
+    async def test_unique_scenario_count_tracks_exact_ids(self) -> None:
+        """Scenario IDs seen in an earlier flush are not counted a second time."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_data(scenario_ids=["s1", "s2", "s3"])
+        first = await builder.flush()
+        assert first["data/cum/num_unique_scenarios"] == 3
+        builder.add_data(scenario_ids=["s2", "s4"])
+        second = await builder.flush()
+        assert second["data/cum/num_unique_scenarios"] == 4
+
+    @pytest.mark.asyncio
+    async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None:
+        """A flush with nothing recorded since the last one returns an empty dict."""
+        builder = MetricsBuilder(cost_context="train")
+        builder.add_metric("time/step_trainer_s", 2.0)
+        builder.add_metric("data/step_trainer_tokens", 20.0)
+        builder.add_data(scenario_ids=["s1"])
+        first = await builder.flush()
+        assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0)
+        assert first["data/cum/num_unique_scenarios"] == 1
+
+        second = await builder.flush()
+        assert second == {}
+
+    @pytest.mark.asyncio
+    async def test_concurrent_add_cost_calls_do_not_lose_updates(self) -> None:
+        """`add_cost` calls interleaved across tasks all show up in the flush."""
+        builder = MetricsBuilder(cost_context="train")
+
+        async def worker() -> None:
+            for _ in range(25):
+                builder.add_cost("train/gpu", usd=0.1)
+                await asyncio.sleep(0)
+
+        await asyncio.gather(*(worker() for _ in range(4)))
+        metrics = await builder.flush()
+        assert metrics["costs/train/gpu"] == pytest.approx(10.0)
+        assert metrics["costs/all"] == pytest.approx(10.0)
+
+    def test_contextvar_activate_and_get_active(self) -> None:
+        """`activate()` makes this builder the one returned by `get_active()`."""
+        builder = MetricsBuilder(cost_context="eval")
+        token = builder.activate()
+        try:
+            assert MetricsBuilder.get_active() is builder
+        finally:
+            # Always reset so a failed assertion cannot leak the active builder.
+            token.var.reset(token)
diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py
new file mode 100644
index 000000000..b2eaadc0f
--- /dev/null
+++ b/tests/unit/test_metrics_taxonomy.py
@@ -0,0 +1,80 @@
+import pytest
+
+from art import Trajectory, TrajectoryGroup
+from art.metrics_taxonomy import (
+    TRAIN_GRADIENT_STEPS_KEY,
+    TrajectoryBatchSummary,
+    average_metric_samples,
+    build_training_summary_metrics,
+    summarize_trajectory_groups,
+)
+
+
+def test_average_metric_samples_handles_sparse_keys() -> None:
+    """Each key is averaged over only the samples that actually contain it."""
+    sparse_samples = [
+        {"loss/train": 1.0, "loss/grad_norm": 0.5},
+        {"loss/train": 0.5},
+        {"loss/grad_norm": 1.0},
+    ]
+    result = average_metric_samples(sparse_samples)
+
+    for key in ("loss/train", "loss/grad_norm"):
+        assert result[key] == pytest.approx(0.75)
+
+
+def test_build_training_summary_metrics_includes_data_and_train_sections() -> None:
+    """Summary counts surface under both `data/step_*` and `train/*` keys."""
+    batch = TrajectoryBatchSummary(
+        num_scenarios=2,
+        num_trajectories=5,
+        num_groups_submitted=2,
+        num_groups_trainable=1,
+        scenario_ids=["a", "b"],
+    )
+    built = build_training_summary_metrics(batch, include_trainable_groups=True)
+    expected = {
+        "data/step_num_scenarios": 2.0,
+        "data/step_num_groups_trainable": 1.0,
+        "train/num_groups_submitted": 2.0,
+        "train/num_trajectories": 5.0,
+    }
+    for key, value in expected.items():
+        assert built[key] == pytest.approx(value)
+
+
+def test_average_metric_samples_requires_invariant_gradient_step_count() -> None:
+    """Samples that disagree on the gradient-step count are rejected."""
+    conflicting = [
+        {TRAIN_GRADIENT_STEPS_KEY: 2.0},
+        {TRAIN_GRADIENT_STEPS_KEY: 3.0},
+    ]
+    with pytest.raises(ValueError, match="must be invariant"):
+        average_metric_samples(conflicting)
+
+
+def test_summarize_trajectory_groups_only_counts_explicit_scenario_id() -> None:
+    """Only groups whose metadata has a `scenario_id` key contribute an ID."""
+    explicit_group = TrajectoryGroup(
+        trajectories=[
+            Trajectory(
+                reward=1.0,
+                messages_and_choices=[{"role": "user", "content": "a"}],
+            )
+        ],
+        metadata={"scenario_id": "scenario-1"},
+    )
+    # Same shape, but the identifier sits under a different metadata key.
+    legacy_group = TrajectoryGroup(
+        trajectories=[
+            Trajectory(
+                reward=0.0,
+                messages_and_choices=[{"role": "user", "content": "b"}],
+            )
+        ],
+        metadata={"scenario_scenario_id": "legacy-scenario"},
+    )
+
+    summary = summarize_trajectory_groups([explicit_group, legacy_group])
+
+    assert summary.scenario_ids == ["scenario-1"]
diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py
new file mode 100644
index 000000000..80553e48b
--- /dev/null
+++ b/tests/unit/test_track_api_cost.py
@@ -0,0 +1,554 @@
+import asyncio
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+from art import Model, TrainableModel, Trajectory, TrajectoryGroup
+from art.costs import compute_sample_costs, get_model_pricing
+from art.metrics import MetricsBuilder, track_api_cost
+from art.pipeline_trainer.trainer import PipelineTrainer
+
+
+class _OpenAIUsage:
+    """Test double carrying OpenAI-style usage counters as plain attributes."""
+
+    def __init__(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+        *,
+        cached_tokens: int = 0,
+    ) -> None:
+        # `cached_tokens` is exposed as a class attribute of a throwaway type.
+        details_cls = type("PromptTokensDetails", (), {"cached_tokens": cached_tokens})
+        self.prompt_tokens = prompt_tokens
+        self.completion_tokens = completion_tokens
+        self.prompt_tokens_details = details_cls()
+
+
+class _OpenAIResponse:
+    """Test double for an OpenAI-style response: a `usage` object plus `model`."""
+
+    def __init__(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+        *,
+        cached_tokens: int = 0,
+        model: str | None = None,
+    ) -> None:
+        self.model = model
+        self.usage = _OpenAIUsage(
+            prompt_tokens, completion_tokens, cached_tokens=cached_tokens
+        )
+
+
+class _AnthropicUsage:
+    """Test double carrying Anthropic-style usage counters as plain attributes."""
+    def __init__(
+        self,
+        input_tokens: int,
+        output_tokens: int,
+        *,
+        cache_creation_input_tokens: int = 0,
+        cache_read_input_tokens: int = 0,
+    ) -> None:
+        self.input_tokens, self.output_tokens = input_tokens, output_tokens
+        self.cache_creation_input_tokens = cache_creation_input_tokens
+        self.cache_read_input_tokens = cache_read_input_tokens
+
+
+class _AnthropicResponse:
+    """Test double for an Anthropic-style response: a `usage` object plus `model`."""
+    def __init__(
+        self,
+        input_tokens: int,
+        output_tokens: int,
+        *,
+        cache_creation_input_tokens: int = 0,
+        cache_read_input_tokens: int = 0,
+        model: str | None = None,
+    ) -> None:
+        self.model = model
+        cache_counts = {
+            "cache_creation_input_tokens": cache_creation_input_tokens,
+            "cache_read_input_tokens": cache_read_input_tokens,
+        }
+        self.usage = _AnthropicUsage(input_tokens, output_tokens, **cache_counts)
+
+
+class TestTrackApiCost:
+    """Tests for how `track_api_cost` prices responses into the active builder."""
+
+    @pytest.mark.asyncio
+    async def test_openai_cost_extraction_with_explicit_pricing(self) -> None:
+        """Per-million prices passed to the decorator price an OpenAI-style usage."""
+        tracker = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
+
+    @pytest.mark.asyncio
+    async def test_openai_cost_extraction_accounts_for_cached_tokens(self) -> None:
+        """Cached prompt tokens are billed at the cached per-million rate."""
+        tracker = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/cached_openai",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=2.0,
+            completion_price_per_million=8.0,
+            cached_prompt_price_per_million=0.5,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=2_000,
+                completion_tokens=100,
+                cached_tokens=1_500,
+            )
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/train/llm_judge/cached_openai"] == pytest.approx(0.00255)
+
+    @pytest.mark.asyncio
+    async def test_anthropic_cost_extraction_uses_registered_model_pricing(
+        self,
+    ) -> None:
+        """Prices registered on the builder apply when the decorator gives none."""
+        tracker = MetricsBuilder(cost_context="train")
+        tracker.register_model_pricing(
+            "anthropic/test-judge",
+            prompt_per_million=5.0,
+            completion_per_million=7.0,
+        )
+
+        @track_api_cost(
+            source="llm_judge/faithfulness",
+            provider="anthropic",
+            model_name="anthropic/test-judge",
+        )
+        async def _judge() -> _AnthropicResponse:
+            return _AnthropicResponse(input_tokens=40, output_tokens=60)
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062)
+
+    @pytest.mark.asyncio
+    async def test_anthropic_cost_extraction_accounts_for_cache_write_and_read(
+        self,
+    ) -> None:
+        """Cache-creation and cache-read tokens are priced at their own rates."""
+        tracker = MetricsBuilder(cost_context="eval")
+        tracker.register_model_pricing(
+            "anthropic/claude-sonnet-4-6",
+            prompt_per_million=3.0,
+            completion_per_million=15.0,
+            cache_creation_per_million=3.75,
+            cache_read_per_million=0.30,
+        )
+
+        @track_api_cost(
+            source="llm_judge/anthropic_cache",
+            provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
+        )
+        async def _judge() -> _AnthropicResponse:
+            return _AnthropicResponse(
+                input_tokens=100,
+                output_tokens=50,
+                cache_creation_input_tokens=1_000,
+                cache_read_input_tokens=500,
+            )
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/eval/llm_judge/anthropic_cache"] == pytest.approx(0.00495)
+
+    @pytest.mark.asyncio
+    async def test_explicit_model_name_uses_global_pricing(self) -> None:
+        """With no prices given, the cost matches the globally looked-up pricing."""
+        tracker = MetricsBuilder(cost_context="train")
+        pricing = get_model_pricing("openai/gpt-oss-20b")
+        assert pricing is not None
+
+        @track_api_cost(
+            source="llm_judge/global_pricing",
+            provider="openai",
+            model_name="openai/gpt-oss-20b",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=1_000,
+                completion_tokens=2_000,
+                model="gpt-oss-20b",
+            )
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        expected = compute_sample_costs(
+            prompt_tokens=1_000,
+            completion_tokens=2_000,
+            cost_context="train",
+            pricing=pricing,
+        )
+        assert flushed["costs/train/llm_judge/global_pricing"] == pytest.approx(
+            expected["costs/train/prefill"] + expected["costs/train/sample"]
+        )
+
+    @pytest.mark.asyncio
+    async def test_explicit_model_name_uses_registered_pricing(self) -> None:
+        """Registered pricing applies even when the response reports a short name."""
+        tracker = MetricsBuilder(cost_context="eval")
+        tracker.register_model_pricing(
+            "anthropic/test-judge",
+            prompt_per_million=1.5,
+            completion_per_million=2.5,
+        )
+
+        @track_api_cost(
+            source="llm_judge/provider_resolution",
+            provider="anthropic",
+            model_name="anthropic/test-judge",
+        )
+        async def _judge() -> _AnthropicResponse:
+            return _AnthropicResponse(
+                input_tokens=400,
+                output_tokens=600,
+                model="test-judge",
+            )
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/eval/llm_judge/provider_resolution"] == pytest.approx(
+            0.0021
+        )
+
+    @pytest.mark.asyncio
+    async def test_explicit_model_name_does_not_depend_on_response_model(self) -> None:
+        """The decorator's `model_name` is priced, not the response's `model`."""
+        tracker = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/snapshot",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=1_000,
+                completion_tokens=100,
+                cached_tokens=800,
+                model="gpt-4.1-2025-04-14",
+            )
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        expected = ((200 * 2.0) + (800 * 0.5) + (100 * 8.0)) / 1_000_000
+        assert flushed["costs/train/llm_judge/snapshot"] == pytest.approx(expected)
+
+    @pytest.mark.asyncio
+    async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None:
+        """A model with no pricing configured makes the call raise `ValueError`."""
+        tracker = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/missing_pricing",
+            provider="openai",
+            model_name="openai/missing-pricing-model",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
+
+        active_token = tracker.activate()
+        try:
+            with pytest.raises(ValueError, match="No pricing configured"):
+                await _judge()
+        finally:
+            active_token.var.reset(active_token)
+
+    @pytest.mark.asyncio
+    async def test_custom_extractor_takes_precedence(self) -> None:
+        """A cost extractor registered for the provider overrides price-based cost."""
+        tracker = MetricsBuilder(cost_context="train")
+        tracker.register_cost_extractor("openai", lambda _response: 0.75)
+
+        @track_api_cost(
+            source="llm_judge/custom",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=1, completion_tokens=1)
+
+        active_token = tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/train/llm_judge/custom"] == pytest.approx(0.75)
+
+    @pytest.mark.asyncio
+    async def test_decorator_noops_without_active_builder(self) -> None:
+        """Without an active builder the decorated call still returns its response."""
+        @track_api_cost(
+            source="llm_judge/no_context",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
+
+        response = await _judge()
+        assert isinstance(response, _OpenAIResponse)
+
+    @pytest.mark.asyncio
+    async def test_for_cost_context_routes_to_eval_and_shares_state(self) -> None:
+        """Costs tracked via a `for_cost_context` view are flushed by the parent."""
+        tracker = MetricsBuilder(cost_context="train")
+        eval_tracker = tracker.for_cost_context("eval")
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        active_token = eval_tracker.activate()
+        try:
+            await _judge()
+        finally:
+            active_token.var.reset(active_token)
+        flushed = await tracker.flush()
+        assert flushed["costs/eval/llm_judge/correctness"] == pytest.approx(0.0002)
+
+
+class TestTrackApiCostIntegration:
+    """End-to-end checks of cost tracking through `Model` and `PipelineTrainer`."""
+
+    @pytest.mark.asyncio
+    async def test_model_log_emits_train_and_eval_costs(self, tmp_path: Path) -> None:
+        """Train and eval judge costs land in successive history rows."""
+        model = Model(
+            name="metrics-cost-test",
+            project="metrics-cost-test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _train_judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        @track_api_cost(
+            source="llm_judge/factuality",
+            provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
+            prompt_price_per_million=3.0,
+            completion_price_per_million=4.0,
+        )
+        async def _eval_judge() -> _AnthropicResponse:
+            return _AnthropicResponse(input_tokens=40, output_tokens=10)
+
+        train_ctx = model.activate_metrics_context("train")
+        try:
+            await _train_judge()
+        finally:
+            train_ctx.var.reset(train_ctx)
+        await model.log(trajectories=None, split="train", step=1, metrics={})
+
+        eval_ctx = model.activate_metrics_context("eval")
+        try:
+            await _eval_judge()
+        finally:
+            eval_ctx.var.reset(eval_ctx)
+        await model.log(trajectories=None, split="val", step=2, metrics={})
+
+        history_file = (
+            tmp_path
+            / "metrics-cost-test"
+            / "models"
+            / "metrics-cost-test"
+            / "history.jsonl"
+        )
+        with open(history_file) as history:
+            train_row = json.loads(history.readline())
+            eval_row = json.loads(history.readline())
+
+        assert train_row["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
+        assert eval_row["costs/eval/llm_judge/factuality"] == pytest.approx(0.00016)
+        assert eval_row["costs/cum/all"] == pytest.approx(0.00036)
+
+    @pytest.mark.asyncio
+    async def test_pipeline_trainer_activates_train_context_for_rollouts(
+        self, tmp_path: Path
+    ) -> None:
+        """The rollout function observes an active builder in the train context."""
+        model = TrainableModel(
+            name="pipeline-context-test",
+            project="pipeline-context-test",
+            base_model="test-model",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = MagicMock()
+        seen_contexts: list[str] = []
+
+        async def rollout_fn(
+            _model: TrainableModel,
+            _scenario: dict,
+            _config: dict,
+        ) -> TrajectoryGroup:
+            seen_contexts.append(MetricsBuilder.get_active().cost_context)
+            return TrajectoryGroup(
+                [
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+
+        pipeline = PipelineTrainer(
+            model=model,
+            backend=backend,
+            rollout_fn=rollout_fn,
+            scenarios=[{"metadata": {"scenario_id": "s1"}}],
+            config={},
+            num_rollout_workers=1,
+            min_batch_size=1,
+            max_batch_size=1,
+            eval_fn=None,
+        )
+        pipeline._output_queue = asyncio.Queue()
+        await pipeline._rollout_worker(worker_id=0)
+
+        assert seen_contexts == ["train"]
+
+    @pytest.mark.asyncio
+    async def test_pipeline_trainer_activates_eval_context_for_eval_fn(
+        self, tmp_path: Path
+    ) -> None:
+        """`eval_fn` runs in the eval context and its cost shows up in history."""
+        model = TrainableModel(
+            name="pipeline-eval-context-test",
+            project="pipeline-eval-context-test",
+            base_model="test-model",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = MagicMock()
+        seen_contexts: list[str] = []
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge_call() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        async def eval_fn(
+            _model: TrainableModel,
+            _step: int,
+            _config: dict,
+        ) -> list[Trajectory]:
+            seen_contexts.append(MetricsBuilder.get_active().cost_context)
+            await _judge_call()
+            return [
+                Trajectory(
+                    reward=1.0,
+                    messages_and_choices=[
+                        {"role": "user", "content": "hello"},
+                        {"role": "assistant", "content": "hi"},
+                    ],
+                )
+            ]
+
+        pipeline = PipelineTrainer(
+            model=model,
+            backend=backend,
+            rollout_fn=lambda *_args, **_kwargs: asyncio.sleep(0),
+            scenarios=[],
+            config={},
+            num_rollout_workers=1,
+            min_batch_size=1,
+            max_batch_size=1,
+            eval_fn=eval_fn,
+        )
+        await pipeline._run_eval(step=1)
+        assert seen_contexts == ["eval"]
+
+        history_file = (
+            tmp_path
+            / "pipeline-eval-context-test"
+            / "models"
+            / "pipeline-eval-context-test"
+            / "history.jsonl"
+        )
+        with open(history_file) as history:
+            logged_rows = [json.loads(line) for line in history if line.strip()]
+
+        assert any("costs/eval/llm_judge/correctness" in row for row in logged_rows)
+        assert any("time/step_eval_s" in row for row in logged_rows)
diff --git a/tests/unit/test_unsloth_metrics.py b/tests/unit/test_unsloth_metrics.py
new file mode 100644
index 000000000..fdb91b0c4
--- /dev/null
+++ b/tests/unit/test_unsloth_metrics.py
@@ -0,0 +1,25 @@
+import asyncio
+from collections import defaultdict
+
+from art.unsloth.train import get_log_fn
+
+
+class _DummyTrainer:
+    def __init__(self) -> None:  # stand-in that only carries a `_metrics` mapping
+        self._metrics = dict(train=defaultdict(list))
+
+
+def test_get_log_fn_routes_eval_metrics_to_val_namespace() -> None:
+    """An eval log call is expected to enqueue a dict keyed under `val/`."""
+    stub = _DummyTrainer()
+    train_buffer = stub._metrics["train"]
+    train_buffer["loss/train"].append(1.5)
+    train_buffer["loss/entropy"].append(0.2)
+    queue: asyncio.Queue[dict[str, float]] = asyncio.Queue()
+    get_log_fn(stub, queue)({"eval_loss": 1.0, "eval_runtime": 2.0})
+    expected = {
+        "val/loss/train": 1.0,
+        "val/loss/entropy": 0.2,
+        "val/runtime": 2.0,
+    }
+    assert queue.get_nowait() == expected