From 501b730e81e97c519e7140963cd7b2b9db4f1ab1 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 16:55:26 -0800 Subject: [PATCH 01/46] test: Capture baseline metric routing behavior --- tests/unit/test_metric_routing.py | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/unit/test_metric_routing.py diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py new file mode 100644 index 00000000..db297e22 --- /dev/null +++ b/tests/unit/test_metric_routing.py @@ -0,0 +1,66 @@ +import json +import os +from pathlib import Path +import types +from unittest.mock import MagicMock, patch + +from art import Model + + +class TestMetricRoutingBaseline: + def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None: + model = Model( + name="test-model", + project="test-project", + base_path=str(tmp_path), + report_metrics=[], + ) + + model._log_metrics( + { + "reward/mean": 0.9, + "custom": 1.0, + }, + split="train", + step=7, + ) + + history_path = tmp_path / "test-project/models/test-model/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["train/reward/mean"] == 0.9 + assert entry["train/custom"] == 1.0 + assert "reward/mean" not in entry + assert "training_step" not in entry + assert "time/wall_clock_sec" not in entry + + def test_get_wandb_run_registers_existing_sections(self, tmp_path: Path) -> None: + fake_run = MagicMock() + fake_run._is_finished = False + + fake_wandb = types.SimpleNamespace() + fake_wandb.init = MagicMock(return_value=fake_run) + fake_wandb.define_metric = MagicMock() + fake_wandb.Settings = lambda **kwargs: kwargs + + with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False): + with patch.dict("sys.modules", {"wandb": fake_wandb}): + model = Model( + name="test-model", + project="test-project", + base_path=str(tmp_path), + ) + run = model._get_wandb_run() + + assert run is fake_run + 
define_calls = [ + (call.args, call.kwargs) + for call in fake_wandb.define_metric.call_args_list + ] + assert define_calls == [ + (("training_step",), {}), + (("train/*",), {"step_metric": "training_step"}), + (("val/*",), {"step_metric": "training_step"}), + (("costs/*",), {"step_metric": "training_step"}), + ] From 6d4f68938031e9eac2d16e6df4008f1e2be0fb20 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 16:57:17 -0800 Subject: [PATCH 02/46] feat: Add section-aware metric routing and W&B taxonomy registration --- src/art/model.py | 46 +++++++++++++++++++++++++++-- tests/unit/test_frontend_logging.py | 14 ++++++++- tests/unit/test_metric_routing.py | 24 ++++++++++----- 3 files changed, 73 insertions(+), 11 deletions(-) diff --git a/src/art/model.py b/src/art/model.py index a5b13582..7d5e7df2 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -2,6 +2,7 @@ from datetime import datetime import json import os +import time from typing import TYPE_CHECKING, Any, Generic, Iterable, Optional, cast, overload import warnings @@ -29,6 +30,19 @@ COSTS_STATE_KEY = "_costs" COSTS_METRIC_PREFIX = "costs_" COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" +METRIC_SECTIONS = frozenset( + { + "reward", + "loss", + "offpolicy", + "pipeline", + "throughput", + "costs", + "time", + "data", + } +) +METRIC_SPLITS = frozenset({"train", "val", "test"}) class Model( @@ -93,6 +107,7 @@ class Model( _s3_prefix: str | None = None _openai_client: AsyncOpenAI | None = None _wandb_run: Optional["Run"] = None # Private, for lazy wandb initialization + _run_start_time: float _costs_lock: asyncio.Lock _cost_calculator: CostCalculator @@ -123,6 +138,7 @@ def __init__( report_metrics=report_metrics, **kwargs, ) + object.__setattr__(self, "_run_start_time", time.time()) @overload def __new__( @@ -380,9 +396,16 @@ def _get_wandb_run(self) -> Optional["Run"]: # Define training_step as the x-axis for all metrics. 
# This allows out-of-order logging (e.g., async validation for previous steps). wandb.define_metric("training_step") + wandb.define_metric("time/wall_clock_sec") + wandb.define_metric("reward/*", step_metric="training_step") + wandb.define_metric("loss/*", step_metric="training_step") + wandb.define_metric("throughput/*", step_metric="training_step") + wandb.define_metric("costs/*", step_metric="training_step") + wandb.define_metric("time/*", step_metric="training_step") + wandb.define_metric("data/*", step_metric="training_step") wandb.define_metric("train/*", step_metric="training_step") wandb.define_metric("val/*", step_metric="training_step") - wandb.define_metric("costs/*", step_metric="training_step") + wandb.define_metric("test/*", step_metric="training_step") return self._wandb_run def _log_metrics( @@ -392,7 +415,24 @@ def _log_metrics( step: int, ) -> None: """Log metrics to history.jsonl and optionally wandb.""" - prefixed = {f"{split}/{k}": v for k, v in metrics.items()} + if split in METRIC_SPLITS: + prefixed = {} + for key, value in metrics.items(): + first_component = key.split("/", 1)[0] + has_prefix_component = "/" in key + if has_prefix_component and ( + first_component in METRIC_SECTIONS + or first_component in METRIC_SPLITS + ): + prefixed[key] = value + else: + prefixed[f"{split}/{key}"] = value + else: + prefixed = {f"{split}/{k}": v for k, v in metrics.items()} + + prefixed["training_step"] = step + prefixed["time/wall_clock_sec"] = time.time() - self._run_start_time + output_dir = self._get_output_dir() # Ensure output directory exists @@ -416,7 +456,7 @@ def _log_metrics( ) or (self.report_metrics is not None and "wandb" in self.report_metrics) if should_log_wandb: if run := self._get_wandb_run(): - run.log({"training_step": step, **prefixed}) + run.log(prefixed) async def _record_costs( self, diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 20278589..ed5ed4a0 100644 --- 
a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -337,10 +337,22 @@ async def test_metric_prefixes(self, tmp_path: Path): entry = json.loads(f.readline()) # All metrics should be prefixed (except step and recorded_at) - metric_keys = [k for k in entry.keys() if k not in ["step", "recorded_at"]] + metric_keys = [ + k + for k in entry.keys() + if k + not in [ + "step", + "recorded_at", + "training_step", + "time/wall_clock_sec", + ] + ] assert all(k.startswith("val/") for k in metric_keys), ( f"Not all metrics prefixed: {metric_keys}" ) + assert entry["training_step"] == 0 + assert entry["time/wall_clock_sec"] >= 0 @pytest.mark.asyncio async def test_standard_metrics_present(self, tmp_path: Path): diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py index db297e22..e83a48ed 100644 --- a/tests/unit/test_metric_routing.py +++ b/tests/unit/test_metric_routing.py @@ -8,7 +8,9 @@ class TestMetricRoutingBaseline: - def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None: + def test_log_metrics_routes_known_sections_without_split_prefix( + self, tmp_path: Path + ) -> None: model = Model( name="test-model", project="test-project", @@ -20,6 +22,7 @@ def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None: { "reward/mean": 0.9, "custom": 1.0, + "rewardish/value": 2.0, }, split="train", step=7, @@ -29,13 +32,13 @@ def test_log_metrics_prefixes_all_keys_with_split(self, tmp_path: Path) -> None: with open(history_path) as f: entry = json.loads(f.readline()) - assert entry["train/reward/mean"] == 0.9 + assert entry["reward/mean"] == 0.9 assert entry["train/custom"] == 1.0 - assert "reward/mean" not in entry - assert "training_step" not in entry - assert "time/wall_clock_sec" not in entry + assert entry["train/rewardish/value"] == 2.0 + assert entry["training_step"] == 7 + assert entry["time/wall_clock_sec"] >= 0 - def test_get_wandb_run_registers_existing_sections(self, 
tmp_path: Path) -> None: + def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None: fake_run = MagicMock() fake_run._is_finished = False @@ -60,7 +63,14 @@ def test_get_wandb_run_registers_existing_sections(self, tmp_path: Path) -> None ] assert define_calls == [ (("training_step",), {}), + (("time/wall_clock_sec",), {}), + (("reward/*",), {"step_metric": "training_step"}), + (("loss/*",), {"step_metric": "training_step"}), + (("throughput/*",), {"step_metric": "training_step"}), + (("costs/*",), {"step_metric": "training_step"}), + (("time/*",), {"step_metric": "training_step"}), + (("data/*",), {"step_metric": "training_step"}), (("train/*",), {"step_metric": "training_step"}), (("val/*",), {"step_metric": "training_step"}), - (("costs/*",), {"step_metric": "training_step"}), + (("test/*",), {"step_metric": "training_step"}), ] From 498a0cdc0e0ba2e4dc8b2c2b2c0e0b4374a0add3 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:15:02 -0800 Subject: [PATCH 03/46] feat: Add MetricsBuilder with hierarchical cost rollups --- src/art/metrics.py | 174 +++++++++++++++++++++++++++++ tests/unit/test_metrics_builder.py | 161 ++++++++++++++++++++++++++ 2 files changed, 335 insertions(+) create mode 100644 src/art/metrics.py create mode 100644 tests/unit/test_metrics_builder.py diff --git a/src/art/metrics.py b/src/art/metrics.py new file mode 100644 index 00000000..4ff89f00 --- /dev/null +++ b/src/art/metrics.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import asyncio +from contextvars import ContextVar, Token +from typing import Any + +_active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder") + +_HIERARCHICAL_SECTIONS = {"costs", "time", "data"} + + +class MetricsBuilder: + """Build and accumulate step-level metrics for logging.""" + + def __init__(self, cost_context: str) -> None: + if not cost_context: + raise ValueError("cost_context must be non-empty") + + self.cost_context = cost_context 
+ self._lock = asyncio.Lock() + self._step_buffer: dict[str, float] = {} + self._cum_state: dict[str, float] = {} + self._unique_scenario_ids: set[str] = set() + + def add_cost(self, path: str, usd: float) -> None: + if not path: + raise ValueError("Cost path must be non-empty") + full_key = f"costs/{path}" + self._validate_and_add(full_key, float(usd)) + + def add_data( + self, + step_num_scenarios: int | None = None, + step_actor_tokens: int | None = None, + scenario_ids: list[str] | None = None, + ) -> None: + if step_num_scenarios is not None: + self._step_buffer["data/step_num_scenarios"] = float(step_num_scenarios) + if step_actor_tokens is not None: + self._step_buffer["data/step_actor_tokens"] = float(step_actor_tokens) + if scenario_ids is not None: + self._unique_scenario_ids.update(scenario_ids) + + def add_user_timing( + self, + step_wall_s: float | None = None, + step_actor_s: float | None = None, + step_eval_s: float | None = None, + ) -> None: + if step_wall_s is not None: + self._step_buffer["time/step_wall_s"] = float(step_wall_s) + if step_actor_s is not None: + self._step_buffer["time/step_actor_s"] = float(step_actor_s) + if step_eval_s is not None: + self._step_buffer["time/step_eval_s"] = float(step_eval_s) + + def add_idle_times( + self, + step_trainer_idle_s: float | None = None, + step_actor_idle_s: float | None = None, + ) -> None: + if step_trainer_idle_s is not None: + self._step_buffer["throughput/step_trainer_idle_s"] = float( + step_trainer_idle_s + ) + if step_actor_idle_s is not None: + self._step_buffer["throughput/step_actor_idle_s"] = float(step_actor_idle_s) + + async def flush(self, step: int) -> dict[str, float]: + del step + async with self._lock: + self._validate_hierarchy() + + result = dict(self._step_buffer) + cost_metrics = { + key: value + for key, value in self._step_buffer.items() + if key.startswith("costs/") + } + result.update(self._compute_rollups(cost_metrics)) + + for key, value in list(result.items()): + 
section = key.split("/", 1)[0] + if section not in _HIERARCHICAL_SECTIONS: + continue + cum_key = f"{key}_cum" + next_value = self._cum_state.get(cum_key, 0.0) + value + self._cum_state[cum_key] = next_value + result[cum_key] = next_value + + if self._unique_scenario_ids: + result["data/cum_num_unique_scenarios"] = float( + len(self._unique_scenario_ids) + ) + + self._step_buffer.clear() + return result + + def activate(self) -> Token["MetricsBuilder"]: + return _active_builder.set(self) + + @staticmethod + def get_active() -> "MetricsBuilder": + return _active_builder.get() + + def state_dict(self) -> dict[str, Any]: + return { + "cum_state": dict(self._cum_state), + "unique_scenario_ids": list(self._unique_scenario_ids), + } + + def load_state_dict(self, state: dict[str, Any]) -> None: + raw_cum_state = state.get("cum_state", {}) + raw_unique_ids = state.get("unique_scenario_ids", []) + self._cum_state = {str(k): float(v) for k, v in raw_cum_state.items()} + self._unique_scenario_ids = {str(v) for v in raw_unique_ids} + + def _validate_and_add(self, key: str, value: float) -> None: + if key.endswith("_cum"): + raise ValueError( + f"Metric key '{key}' ends with '_cum', which is reserved for cumulative metrics." + ) + + for existing_key in self._step_buffer: + if existing_key == key: + continue + if existing_key.startswith(f"{key}/"): + raise ValueError( + f"Cannot log '{key}' as a leaf: it is an ancestor of '{existing_key}'." + ) + if key.startswith(f"{existing_key}/"): + raise ValueError( + f"Cannot log '{key}' as a leaf: '{existing_key}' is already a leaf ancestor." + ) + + self._step_buffer[key] = self._step_buffer.get(key, 0.0) + value + + def _validate_hierarchy(self) -> None: + keys = sorted(k for k in self._step_buffer if k.startswith("costs/")) + for i, key in enumerate(keys): + for other in keys[i + 1 :]: + if other.startswith(f"{key}/"): + raise ValueError( + f"Leaf/parent conflict: '{key}' and '{other}' cannot coexist." 
+ ) + + def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]: + if not cost_metrics: + return {} + + all_parents: set[str] = set() + for key in cost_metrics: + parts = key.split("/") + for depth in range(2, len(parts)): + all_parents.add("/".join(parts[:depth])) + + rollups: dict[str, float] = {} + for parent in all_parents: + prefix = f"{parent}/" + rollups[parent] = sum( + value for key, value in cost_metrics.items() if key.startswith(prefix) + ) + + top_level_children = {key.split("/")[1] for key in cost_metrics} + costs_all = 0.0 + for child_name in top_level_children: + child_key = f"costs/{child_name}" + if child_key in rollups: + costs_all += rollups[child_key] + else: + costs_all += cost_metrics[child_key] + rollups["costs/all"] = costs_all + + return rollups diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py new file mode 100644 index 00000000..7b56c150 --- /dev/null +++ b/tests/unit/test_metrics_builder.py @@ -0,0 +1,161 @@ +import asyncio + +import pytest + +from art.metrics import MetricsBuilder + + +class TestMetricsBuilder: + @pytest.mark.asyncio + async def test_rollup_correctness_across_depths(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train/llm_judge/general_judge", usd=0.08) + builder.add_cost("train/llm_judge/hallucination_judge", usd=0.04) + builder.add_cost("train/tinker_train", usd=1.20) + builder.add_cost("train/tinker_inference", usd=0.45) + builder.add_cost("eval/llm_judge/correctness", usd=0.06) + + metrics = await builder.flush(step=1) + + assert metrics["costs/train/llm_judge"] == pytest.approx(0.12) + assert metrics["costs/train"] == pytest.approx(1.77) + assert metrics["costs/eval"] == pytest.approx(0.06) + assert metrics["costs/all"] == pytest.approx(1.83) + assert metrics["costs/train/llm_judge_cum"] == pytest.approx(0.12) + assert metrics["costs/train_cum"] == pytest.approx(1.77) + assert metrics["costs/all_cum"] == 
pytest.approx(1.83) + + @pytest.mark.asyncio + async def test_cum_accumulates_for_hierarchical_sections(self) -> None: + builder = MetricsBuilder(cost_context="train") + + builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3) + builder.add_data( + step_num_scenarios=2, + step_actor_tokens=10, + scenario_ids=["a", "b"], + ) + first = await builder.flush(step=1) + + assert first["time/step_wall_s_cum"] == pytest.approx(1.5) + assert first["time/step_actor_s_cum"] == pytest.approx(0.3) + assert first["data/step_num_scenarios_cum"] == pytest.approx(2) + assert first["data/step_actor_tokens_cum"] == pytest.approx(10) + assert first["data/cum_num_unique_scenarios"] == 2 + + builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2) + builder.add_data( + step_num_scenarios=3, + step_actor_tokens=5, + scenario_ids=["b", "c"], + ) + second = await builder.flush(step=2) + + assert second["time/step_wall_s_cum"] == pytest.approx(2.0) + assert second["time/step_actor_s_cum"] == pytest.approx(0.5) + assert second["data/step_num_scenarios_cum"] == pytest.approx(5) + assert second["data/step_actor_tokens_cum"] == pytest.approx(15) + assert second["data/cum_num_unique_scenarios"] == 3 + + @pytest.mark.asyncio + async def test_costs_all_generated_for_single_and_multiple_children(self) -> None: + single = MetricsBuilder(cost_context="train") + single.add_cost("train/gpu", usd=2.0) + one = await single.flush(step=1) + assert one["costs/all"] == pytest.approx(2.0) + + multi = MetricsBuilder(cost_context="train") + multi.add_cost("train/gpu", usd=2.0) + multi.add_cost("eval/llm_judge/correctness", usd=0.5) + two = await multi.flush(step=1) + assert two["costs/all"] == pytest.approx(2.5) + + def test_leaf_parent_conflicts_raise(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train", usd=1.0) + with pytest.raises(ValueError): + builder.add_cost("train/llm_judge", usd=0.1) + + other = MetricsBuilder(cost_context="train") + 
other.add_cost("train/llm_judge", usd=0.1) + with pytest.raises(ValueError): + other.add_cost("train", usd=1.0) + + @pytest.mark.asyncio + async def test_duplicate_leaf_writes_are_summed(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train/gpu", usd=1.25) + builder.add_cost("train/gpu", usd=0.75) + + metrics = await builder.flush(step=1) + + assert metrics["costs/train/gpu"] == pytest.approx(2.0) + assert metrics["costs/train"] == pytest.approx(2.0) + assert metrics["costs/all"] == pytest.approx(2.0) + + def test_cum_suffix_is_reserved(self) -> None: + builder = MetricsBuilder(cost_context="train") + with pytest.raises(ValueError): + builder.add_cost("train/llm_judge_cum", usd=0.1) + + @pytest.mark.asyncio + async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train/gpu", usd=1.0) + first = await builder.flush(step=1) + assert first["costs/train_cum"] == pytest.approx(1.0) + + second = await builder.flush(step=2) + assert not any(key.startswith("costs/") for key in second) + + builder.add_cost("train/gpu", usd=2.0) + third = await builder.flush(step=3) + assert third["costs/train"] == pytest.approx(2.0) + assert third["costs/train_cum"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: + before = MetricsBuilder(cost_context="train") + before.add_cost("train/gpu", usd=1.0) + await before.flush(step=1) + + state = before.state_dict() + after = MetricsBuilder(cost_context="train") + after.load_state_dict(state) + after.add_cost("train/gpu", usd=2.0) + + metrics = await after.flush(step=2) + assert metrics["costs/train_cum"] == pytest.approx(3.0) + assert metrics["costs/all_cum"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_unique_scenario_count_tracks_exact_ids(self) -> None: + builder = MetricsBuilder(cost_context="train") + 
builder.add_data(scenario_ids=["s1", "s2", "s3"]) + first = await builder.flush(step=1) + assert first["data/cum_num_unique_scenarios"] == 3 + + builder.add_data(scenario_ids=["s2", "s4"]) + second = await builder.flush(step=2) + assert second["data/cum_num_unique_scenarios"] == 4 + + @pytest.mark.asyncio + async def test_concurrent_add_cost_calls_do_not_lose_updates(self) -> None: + builder = MetricsBuilder(cost_context="train") + + async def worker() -> None: + for _ in range(25): + builder.add_cost("train/gpu", usd=0.1) + await asyncio.sleep(0) + + await asyncio.gather(*(worker() for _ in range(4))) + metrics = await builder.flush(step=1) + + assert metrics["costs/train/gpu"] == pytest.approx(10.0) + assert metrics["costs/all"] == pytest.approx(10.0) + + def test_contextvar_activate_and_get_active(self) -> None: + builder = MetricsBuilder(cost_context="eval") + token = builder.activate() + assert MetricsBuilder.get_active() is builder + token.var.reset(token) From c4e848c5c579dc0a7cc3a0476af25ca21238b635 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:25:52 -0800 Subject: [PATCH 04/46] test: Capture baseline train trajectory metric routing --- tests/unit/test_frontend_logging.py | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index ed5ed4a0..83251f7e 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -467,6 +467,42 @@ async def test_exception_rate_calculation(self, tmp_path: Path): # All successful trajectories = 0% exception rate assert entry["val/exception_rate"] == 0.0 + @pytest.mark.asyncio + async def test_train_trajectory_metrics_default_to_train_prefix_baseline( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + trajectories = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.7, + metrics={ + 
"custom_score": 1.0, + "reward/prefixed": 2.0, + }, + messages_and_choices=[{"role": "user", "content": "test"}], + ) + ], + exceptions=[], + ) + ] + + await model.log(trajectories, split="train") + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["train/custom_score"] == 1.0 + assert entry["reward/prefixed"] == 2.0 + class TestWandbIntegration: """Test wandb integration logic (without mocking wandb itself).""" From 1d9bf2616bc87f1d05544d5133b040a4d7ff799d Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:27:52 -0800 Subject: [PATCH 05/46] feat: Route train trajectory metrics and log costs via MetricsBuilder --- src/art/model.py | 189 ++++++++-------------------- tests/unit/test_frontend_logging.py | 47 ++++++- 2 files changed, 97 insertions(+), 139 deletions(-) diff --git a/src/art/model.py b/src/art/model.py index 7d5e7df2..18a1a9ac 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -14,6 +14,7 @@ from . 
import dev from .costs import CostCalculator +from .metrics import MetricsBuilder from .trajectories import Trajectory, TrajectoryGroup from .types import TrainConfig, TrainSFTConfig from .utils.trajectory_logging import write_trajectory_groups_parquet @@ -27,7 +28,6 @@ ModelConfig = TypeVar("ModelConfig", bound=BaseModel | None) StateType = TypeVar("StateType", bound=dict[str, Any], default=dict[str, Any]) -COSTS_STATE_KEY = "_costs" COSTS_METRIC_PREFIX = "costs_" COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" METRIC_SECTIONS = frozenset( @@ -108,7 +108,7 @@ class Model( _openai_client: AsyncOpenAI | None = None _wandb_run: Optional["Run"] = None # Private, for lazy wandb initialization _run_start_time: float - _costs_lock: asyncio.Lock + _metrics_builder: MetricsBuilder _cost_calculator: CostCalculator def __init__( @@ -139,6 +139,7 @@ def __init__( **kwargs, ) object.__setattr__(self, "_run_start_time", time.time()) + object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train")) @overload def __new__( @@ -458,63 +459,30 @@ def _log_metrics( if run := self._get_wandb_run(): run.log(prefixed) - async def _record_costs( - self, - split: str, - step: int, - *, - cost_components: dict[str, float], - cost_total_direct: float, - cost_seen: bool, - ) -> None: - component_total = sum(cost_components.values()) - step_total = component_total if component_total > 0 else cost_total_direct - if not cost_seen or step_total <= 0: - return - - async with self._costs_lock: - existing_state = self.read_state() or {} - raw_costs = existing_state.get(COSTS_STATE_KEY) or {} - cumulative = { - key: float(value) - for key, value in raw_costs.items() - if isinstance(value, (int, float)) - } - last_steps = raw_costs.get("_last_steps") - if not isinstance(last_steps, dict): - last_steps = {} - last_step = last_steps.get(split) - - if isinstance(last_step, (int, float)) and int(last_step) >= step: - for component, value in cost_components.items(): - if value == 0: - 
continue - cumulative_key = f"{split}_{component}" - cumulative[cumulative_key] = max( - cumulative.get(cumulative_key, 0.0), value - ) - cumulative[split] = max(cumulative.get(split, 0.0), step_total) - cumulative["total"] = max( - cumulative.get("total", 0.0), cumulative.get(split, 0.0) - ) - self.merge_state( - {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}} + def _extract_non_cost_metrics( + self, metrics: dict[str, float], split: str + ) -> dict[str, float]: + non_cost_metrics: dict[str, float] = {} + cost_context = "train" if split == "train" else "eval" + for metric, value in metrics.items(): + numeric_value = float(value) + if metric == COSTS_TOTAL_KEY: + raise ValueError( + "Do not log 'costs_total' directly. Log costs_* components " + "(e.g., costs_prefill, costs_sample) and totals are derived." ) - self._log_metrics(cumulative, "costs", step) - return - - for component, value in cost_components.items(): - if value == 0: - continue - cumulative_key = f"{split}_{component}" - cumulative[cumulative_key] = cumulative.get(cumulative_key, 0.0) + value - cumulative[split] = cumulative.get(split, 0.0) + step_total - cumulative["total"] = cumulative.get("total", 0.0) + step_total - last_steps[split] = step - self.merge_state( - {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}} - ) - self._log_metrics(cumulative, "costs", step) + if metric.startswith("costs/"): + self._metrics_builder.add_cost(metric[len("costs/") :], numeric_value) + continue + if metric.startswith(COSTS_METRIC_PREFIX): + component = metric[len(COSTS_METRIC_PREFIX) :] + if component: + self._metrics_builder.add_cost( + f"{cost_context}/{component}", numeric_value + ) + continue + non_cost_metrics[metric] = numeric_value + return non_cost_metrics async def log( self, @@ -549,42 +517,12 @@ async def log( # If only metrics provided (no trajectories), just log them and return if trajectories is None: if metrics is not None: - cost_step = await self.get_step() - 
cost_components: dict[str, float] = {} - cost_total_direct = 0.0 - cost_seen = False - - for metric, value in metrics.items(): - if not isinstance(value, (int, float)): - continue - if metric == COSTS_TOTAL_KEY: - raise ValueError( - "Do not log 'costs_total' directly. Log costs_* components " - "(e.g., costs_prefill, costs_sample) and totals are derived." - ) - elif metric.startswith(COSTS_METRIC_PREFIX): - component = metric[len(COSTS_METRIC_PREFIX) :] - if component: - cost_components[component] = cost_components.get( - component, 0.0 - ) + float(value) - cost_seen = True - - metrics_without_costs = { - key: value - for key, value in metrics.items() - if not key.startswith(COSTS_METRIC_PREFIX) - } + metrics_without_costs = self._extract_non_cost_metrics(metrics, split) if metrics_without_costs: self._log_metrics(metrics_without_costs, split, step) - - await self._record_costs( - split, - cost_step, - cost_components=cost_components, - cost_total_direct=cost_total_direct, - cost_seen=cost_seen, - ) + costs = await self._metrics_builder.flush(step) + if costs: + self._log_metrics(costs, split, step) return # Convert to list[TrajectoryGroup] @@ -611,38 +549,18 @@ async def log( ) # 2. Calculate aggregate metrics (excluding additive costs) - cost_step = await self.get_step() all_metrics: dict[str, list[float]] = {"reward": [], "exception_rate": []} group_metrics: dict[str, list[float]] = {} - cost_components: dict[str, float] = {} - cost_total_direct = 0.0 - cost_seen = False - - def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None: - nonlocal cost_total_direct, cost_seen - for metric, value in metrics_dict.items(): - if not isinstance(value, (int, float)): - continue - if metric == COSTS_TOTAL_KEY: - raise ValueError( - "Do not log 'costs_total' directly. Log costs_* components " - "(e.g., costs_prefill, costs_sample) and totals are derived." 
- ) - elif metric.startswith(COSTS_METRIC_PREFIX): - component = metric[len(COSTS_METRIC_PREFIX) :] - if component: - cost_components[component] = cost_components.get( - component, 0.0 - ) + float(value) - cost_seen = True for group in trajectory_groups: if group.metrics: - _add_costs(group.metrics) + group_non_cost = self._extract_non_cost_metrics( + cast(dict[str, float], group.metrics), split + ) + else: + group_non_cost = {} if group.trajectories: - for metric, value in group.metrics.items(): - if metric.startswith(COSTS_METRIC_PREFIX): - continue + for metric, value in group_non_cost.items(): if metric not in group_metrics: group_metrics[metric] = [] group_metrics[metric].append(float(value)) @@ -656,14 +574,21 @@ def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None: all_metrics["reward"].append(trajectory.reward) # Collect other custom metrics + trajectory_metrics: dict[str, float] = {} for metric, value in trajectory.metrics.items(): - if metric.startswith(COSTS_METRIC_PREFIX): - continue + routed_metric = metric + if split == "train" and "/" not in routed_metric: + routed_metric = f"reward/{routed_metric}" + trajectory_metrics[routed_metric] = float(value) + + non_cost_trajectory_metrics = self._extract_non_cost_metrics( + trajectory_metrics, + split, + ) + for metric, value in non_cost_trajectory_metrics.items(): if metric not in all_metrics: all_metrics[metric] = [] all_metrics[metric].append(float(value)) - if trajectory.metrics: - _add_costs(trajectory.metrics) # Calculate averages for all metrics averages: dict[str, float] = {} @@ -685,25 +610,16 @@ def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None: # Merge in any additional metrics passed directly if metrics is not None: - _add_costs(metrics) - metrics_without_costs = { - key: value - for key, value in metrics.items() - if not key.startswith(COSTS_METRIC_PREFIX) - } + metrics_without_costs = self._extract_non_cost_metrics(metrics, split) 
averages.update(metrics_without_costs) # 3. Log metrics (writes to history.jsonl and wandb) self._log_metrics(averages, split, step) - # 4. Log cumulative costs (additive) - await self._record_costs( - split, - cost_step, - cost_components=cost_components, - cost_total_direct=cost_total_direct, - cost_seen=cost_seen, - ) + # 4. Log cumulative costs + costs = await self._metrics_builder.flush(step) + if costs: + self._log_metrics(costs, split, step) async def get_step(self) -> int: """ @@ -754,7 +670,6 @@ def __init__( report_metrics=report_metrics, **kwargs, ) - object.__setattr__(self, "_costs_lock", asyncio.Lock()) object.__setattr__(self, "_cost_calculator", self._noop_cost_calculator) if _internal_config is not None: # Bypass BaseModel __setattr__ to allow setting private attr diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 83251f7e..1870f933 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -468,7 +468,7 @@ async def test_exception_rate_calculation(self, tmp_path: Path): assert entry["val/exception_rate"] == 0.0 @pytest.mark.asyncio - async def test_train_trajectory_metrics_default_to_train_prefix_baseline( + async def test_train_trajectory_metrics_default_to_reward_prefix( self, tmp_path: Path ): model = Model( @@ -500,9 +500,52 @@ async def test_train_trajectory_metrics_default_to_train_prefix_baseline( with open(history_path) as f: entry = json.loads(f.readline()) - assert entry["train/custom_score"] == 1.0 + assert entry["reward/custom_score"] == 1.0 assert entry["reward/prefixed"] == 2.0 + @pytest.mark.asyncio + async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + await model.log( + trajectories=None, + split="train", + step=1, + metrics={ + "costs_prefill": 0.2, + "costs_sample": 0.3, + }, + ) + await model.log( + trajectories=None, + 
split="train", + step=2, + metrics={ + "costs_prefill": 0.1, + }, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["costs/train/prefill"] == pytest.approx(0.2) + assert first["costs/train/sample"] == pytest.approx(0.3) + assert first["costs/train"] == pytest.approx(0.5) + assert first["costs/all"] == pytest.approx(0.5) + assert first["costs/all_cum"] == pytest.approx(0.5) + + assert second["costs/train/prefill"] == pytest.approx(0.1) + assert second["costs/train/prefill_cum"] == pytest.approx(0.3) + assert second["costs/train_cum"] == pytest.approx(0.6) + assert second["costs/all_cum"] == pytest.approx(0.6) + class TestWandbIntegration: """Test wandb integration logic (without mocking wandb itself).""" From 89a58e18829c6bf77420c146ee37452524f95209 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:30:42 -0800 Subject: [PATCH 06/46] feat: Rename train metrics to reward, loss, and throughput sections --- src/art/model.py | 58 ++++++++++++++++--- .../binary_prefix_tool_pipeline.py | 6 +- tests/unit/test_frontend_logging.py | 9 ++- 3 files changed, 60 insertions(+), 13 deletions(-) diff --git a/src/art/model.py b/src/art/model.py index 18a1a9ac..eece4e9f 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -43,6 +43,25 @@ } ) METRIC_SPLITS = frozenset({"train", "val", "test"}) +TRAIN_METRIC_KEY_RENAMES = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "tokens_per_second": "throughput/train_tok_per_sec", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + 
"num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} class Model( @@ -481,9 +500,18 @@ def _extract_non_cost_metrics( f"{cost_context}/{component}", numeric_value ) continue - non_cost_metrics[metric] = numeric_value + routed_metric = self._rename_train_metric_key(metric, split) + non_cost_metrics[routed_metric] = numeric_value return non_cost_metrics + @staticmethod + def _rename_train_metric_key(metric: str, split: str) -> str: + if split != "train": + return metric + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_'):]}" + return TRAIN_METRIC_KEY_RENAMES.get(metric, metric) + async def log( self, trajectories: ( @@ -549,7 +577,16 @@ async def log( ) # 2. Calculate aggregate metrics (excluding additive costs) - all_metrics: dict[str, list[float]] = {"reward": [], "exception_rate": []} + reward_key = "reward/mean" if split == "train" else "reward" + exception_rate_key = ( + "reward/exception_rate" if split == "train" else "exception_rate" + ) + reward_std_dev_key = "reward/std_dev" if split == "train" else "reward_std_dev" + + all_metrics: dict[str, list[float]] = { + reward_key: [], + exception_rate_key: [], + } group_metrics: dict[str, list[float]] = {} for group in trajectory_groups: @@ -566,12 +603,12 @@ async def log( group_metrics[metric].append(float(value)) for trajectory in group: if isinstance(trajectory, BaseException): - all_metrics["exception_rate"].append(1) + all_metrics[exception_rate_key].append(1) continue else: - all_metrics["exception_rate"].append(0) + all_metrics[exception_rate_key].append(0) # Add reward metric - all_metrics["reward"].append(trajectory.reward) + all_metrics[reward_key].append(trajectory.reward) # Collect other custom metrics trajectory_metrics: dict[str, float] = {} @@ -599,14 +636,17 @@ async def log( # Aggregate group-level metrics once per group 
for metric, values in group_metrics.items(): if len(values) > 0: - averages[f"group_metric_{metric}"] = sum(values) / len(values) + group_key = ( + f"reward/group_{metric}" if split == "train" else f"group_metric_{metric}" + ) + averages[group_key] = sum(values) / len(values) # Calculate average standard deviation of rewards within groups from .utils.old_benchmarking.calculate_step_metrics import ( calculate_step_std_dev, ) - averages["reward_std_dev"] = calculate_step_std_dev(trajectory_groups) + averages[reward_std_dev_key] = calculate_step_std_dev(trajectory_groups) # Merge in any additional metrics passed directly if metrics is not None: @@ -900,6 +940,10 @@ async def train_sft( / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} } + avg_metrics = { + self._rename_train_metric_key(key, "train"): value + for key, value in avg_metrics.items() + } # Get the current step after training step = await self.get_step() self._log_metrics(avg_metrics, "train", step) diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py index 52c82975..bc2f5a04 100644 --- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py +++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py @@ -159,12 +159,12 @@ def print_history_summary(model: art.TrainableModel, tail: int = 5) -> None: rows = pl.read_ndjson(str(history_path)).to_dicts() - train_rows = [row for row in rows if "train/reward" in row] + train_rows = [row for row in rows if "reward/mean" in row] print("\nRecent training metrics:") for row in train_rows[-tail:]: step = row["step"] - reward = row["train/reward"] - std_dev = row["train/reward_std_dev"] + reward = row["reward/mean"] + std_dev = row["reward/std_dev"] discarded = row["train/discarded_stale_samples"] off_policy = row["train/steps_off_policy"] print( diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 
1870f933..89be518a 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -231,7 +231,7 @@ async def test_history_appends_entries( # Check both splits are present columns = df.columns assert any("val/" in col for col in columns) - assert any("train/" in col for col in columns) + assert any("reward/" in col for col in columns) class TestPathStructure: @@ -500,6 +500,9 @@ async def test_train_trajectory_metrics_default_to_reward_prefix( with open(history_path) as f: entry = json.loads(f.readline()) + assert entry["reward/mean"] == 0.7 + assert entry["reward/exception_rate"] == 0.0 + assert "train/reward" not in entry assert entry["reward/custom_score"] == 1.0 assert entry["reward/prefixed"] == 2.0 @@ -719,8 +722,8 @@ async def mock_train_sft(*args, **kwargs): # Verify metrics are aggregated (averaged) entry = json.loads(lines[0]) assert entry["step"] == 1 - assert entry["train/loss"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 - assert entry["train/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 + assert entry["loss/train"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 + assert entry["loss/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 @pytest.mark.asyncio async def test_train_sft_single_step_increment(self, tmp_path: Path): From 20f99674b0dacf0fa656a8b1334161713f739c40 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:31:50 -0800 Subject: [PATCH 07/46] feat: Persist MetricsBuilder cumulative state across resume --- src/art/model.py | 21 ++++++++++++++++ tests/unit/test_frontend_logging.py | 39 +++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/art/model.py b/src/art/model.py index eece4e9f..6f6cd470 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -30,6 +30,7 @@ COSTS_METRIC_PREFIX = "costs_" COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" +METRICS_BUILDER_STATE_KEY = "_metrics_builder_state" METRIC_SECTIONS = frozenset( { "reward", @@ -128,6 +129,7 
@@ class Model( _wandb_run: Optional["Run"] = None # Private, for lazy wandb initialization _run_start_time: float _metrics_builder: MetricsBuilder + _metrics_builder_state_loaded: bool _cost_calculator: CostCalculator def __init__( @@ -159,6 +161,7 @@ def __init__( ) object.__setattr__(self, "_run_start_time", time.time()) object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train")) + object.__setattr__(self, "_metrics_builder_state_loaded", False) @overload def __new__( @@ -504,6 +507,20 @@ def _extract_non_cost_metrics( non_cost_metrics[routed_metric] = numeric_value return non_cost_metrics + def _load_metrics_builder_state(self) -> None: + if self._metrics_builder_state_loaded: + return + state = self.read_state() or {} + metrics_state = state.get(METRICS_BUILDER_STATE_KEY) + if isinstance(metrics_state, dict): + self._metrics_builder.load_state_dict(metrics_state) + object.__setattr__(self, "_metrics_builder_state_loaded", True) + + def _persist_metrics_builder_state(self) -> None: + self.merge_state( + {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()} + ) + @staticmethod def _rename_train_metric_key(metric: str, split: str) -> str: if split != "train": @@ -542,6 +559,8 @@ async def log( if step is None: step = await self.get_step() if self.trainable else 0 + self._load_metrics_builder_state() + # If only metrics provided (no trajectories), just log them and return if trajectories is None: if metrics is not None: @@ -551,6 +570,7 @@ async def log( costs = await self._metrics_builder.flush(step) if costs: self._log_metrics(costs, split, step) + self._persist_metrics_builder_state() return # Convert to list[TrajectoryGroup] @@ -660,6 +680,7 @@ async def log( costs = await self._metrics_builder.flush(step) if costs: self._log_metrics(costs, split, step) + self._persist_metrics_builder_state() async def get_step(self) -> int: """ diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 
89be518a..4d0415fc 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -549,6 +549,45 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): assert second["costs/train_cum"] == pytest.approx(0.6) assert second["costs/all_cum"] == pytest.approx(0.6) + @pytest.mark.asyncio + async def test_cost_cumulative_persists_across_model_recreation( + self, tmp_path: Path + ): + model_1 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + await model_1.log( + trajectories=None, + split="train", + step=1, + metrics={"costs_prefill": 0.25}, + ) + + model_2 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + await model_2.log( + trajectories=None, + split="train", + step=2, + metrics={"costs_prefill": 0.75}, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["costs/train/prefill_cum"] == pytest.approx(0.25) + assert second["costs/train/prefill_cum"] == pytest.approx(1.0) + assert second["costs/all_cum"] == pytest.approx(1.0) + class TestWandbIntegration: """Test wandb integration logic (without mocking wandb itself).""" From 3e5ab1b66d47c73f29492c29a2ff005d878677cf Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:42:55 -0800 Subject: [PATCH 08/46] feat: Emit canonical train metric keys at source --- src/art/local/backend.py | 14 ++++++----- src/art/megatron/train.py | 4 ++-- src/art/metrics_taxonomy.py | 31 ++++++++++++++++++++++++ src/art/model.py | 37 +++-------------------------- src/art/serverless/backend.py | 19 ++++++++++++--- src/art/tinker/service.py | 2 +- src/art/tinker_native/backend.py | 15 +++++++----- src/art/unsloth/service.py | 12 +++++----- src/art/unsloth/train.py | 16 +++++++------ tests/unit/test_frontend_logging.py | 18 +++++++++++--- 10 files changed, 
100 insertions(+), 68 deletions(-) create mode 100644 src/art/metrics_taxonomy.py diff --git a/src/art/local/backend.py b/src/art/local/backend.py index b74c0b05..aecef80a 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -39,6 +39,7 @@ from .. import dev from ..backend import AnyTrainableModel, Backend +from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics from ..model import Model, TrainableModel from ..preprocessing.pack import ( PackedTensors, @@ -579,7 +580,7 @@ async def train( # type: ignore[override] k: sum(d.get(k, 0) for d in training_metrics) / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} - if k != "num_gradient_steps" + if k != TRAIN_GRADIENT_STEPS_KEY } # Get step and checkpoint path @@ -686,9 +687,9 @@ async def _train_model( # Yield metrics showing no groups were trainable # (the frontend will handle logging) yield { - "num_groups_submitted": num_groups_submitted, - "num_groups_trainable": 0, - "num_gradient_steps": 0, + "train/num_groups_submitted": float(num_groups_submitted), + "train/num_groups_trainable": 0.0, + TRAIN_GRADIENT_STEPS_KEY: 0.0, } return disk_packed_tensors = packed_tensors_to_dir( @@ -701,14 +702,15 @@ async def _train_model( async for result in service.train( disk_packed_tensors, config, dev_config, verbose ): + result = rename_train_metrics(result) num_gradient_steps = int( - result.pop("num_gradient_steps", estimated_gradient_steps) + result.pop(TRAIN_GRADIENT_STEPS_KEY, estimated_gradient_steps) ) assert num_gradient_steps == estimated_gradient_steps, ( f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}" ) results.append(result) - yield {**result, "num_gradient_steps": num_gradient_steps} + yield {**result, TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps)} pbar.update(1) pbar.set_postfix(result) pbar.close() diff --git a/src/art/megatron/train.py b/src/art/megatron/train.py index 
876611a6..3441a0c4 100644 --- a/src/art/megatron/train.py +++ b/src/art/megatron/train.py @@ -282,8 +282,8 @@ def print0(*values: Any) -> None: with open("/tmp/megatron_training_log.jsonl", "a+") as log_file: log_msg = json.dumps( { - "loss": loss.item(), - "grad_norm": grad_norm, + "loss/train": loss.item(), + "loss/grad_norm": grad_norm, "probs_corr": probs_corr, } ) diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py new file mode 100644 index 00000000..e7d108d4 --- /dev/null +++ b/src/art/metrics_taxonomy.py @@ -0,0 +1,31 @@ +TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps" + +TRAIN_METRIC_KEY_RENAMES = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "tokens_per_second": "throughput/train_tok_per_sec", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def rename_train_metric_key(metric: str) -> str: + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_'):]}" + return TRAIN_METRIC_KEY_RENAMES.get(metric, metric) + + +def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]: + return {rename_train_metric_key(key): float(value) for key, value in metrics.items()} diff --git a/src/art/model.py b/src/art/model.py index 6f6cd470..afe0073d 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -15,6 +15,7 @@ from . 
import dev from .costs import CostCalculator from .metrics import MetricsBuilder +from .metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY from .trajectories import Trajectory, TrajectoryGroup from .types import TrainConfig, TrainSFTConfig from .utils.trajectory_logging import write_trajectory_groups_parquet @@ -44,25 +45,6 @@ } ) METRIC_SPLITS = frozenset({"train", "val", "test"}) -TRAIN_METRIC_KEY_RENAMES = { - "reward": "reward/mean", - "reward_std_dev": "reward/std_dev", - "exception_rate": "reward/exception_rate", - "policy_loss": "loss/train", - "loss": "loss/train", - "entropy": "loss/entropy", - "kl_div": "loss/kl_div", - "kl_policy_ref": "loss/kl_policy_ref", - "grad_norm": "loss/grad_norm", - "learning_rate": "loss/learning_rate", - "tokens_per_second": "throughput/train_tok_per_sec", - "num_groups_submitted": "train/num_groups_submitted", - "num_groups_trainable": "train/num_groups_trainable", - "num_trajectories": "train/num_trajectories", - "num_trainable_tokens": "train/num_trainable_tokens", - "train_tokens": "data/step_trainer_tokens", - "num_datums": "data/step_num_datums", -} class Model( @@ -503,8 +485,7 @@ def _extract_non_cost_metrics( f"{cost_context}/{component}", numeric_value ) continue - routed_metric = self._rename_train_metric_key(metric, split) - non_cost_metrics[routed_metric] = numeric_value + non_cost_metrics[metric] = numeric_value return non_cost_metrics def _load_metrics_builder_state(self) -> None: @@ -521,14 +502,6 @@ def _persist_metrics_builder_state(self) -> None: {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()} ) - @staticmethod - def _rename_train_metric_key(metric: str, split: str) -> str: - if split != "train": - return metric - if metric.startswith("group_metric_"): - return f"reward/group_{metric[len('group_metric_'):]}" - return TRAIN_METRIC_KEY_RENAMES.get(metric, metric) - async def log( self, trajectories: ( @@ -913,7 +886,7 @@ async def train( k: sum(d.get(k, 0) for d in training_metrics) / sum(1 for d 
in training_metrics if k in d) for k in {k for d in training_metrics for k in d} - if k != "num_gradient_steps" + if k != TRAIN_GRADIENT_STEPS_KEY } # 3. Log trajectories and training metrics together (single wandb log call) @@ -961,10 +934,6 @@ async def train_sft( / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} } - avg_metrics = { - self._rename_train_metric_key(key, "train"): value - for key, value in avg_metrics.items() - } # Get the current step after training step = await self.get_step() self._log_metrics(avg_metrics, "train", step) diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index abf67f69..f9ab8c5f 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -9,6 +9,7 @@ from .. import dev from ..backend import AnyTrainableModel, Backend +from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics from ..trajectories import Trajectory, TrajectoryGroup from ..types import ServerlessTrainResult, TrainConfig, TrainSFTConfig from ..utils.record_provenance import record_provenance @@ -247,7 +248,7 @@ async def train( # type: ignore[override] k: sum(d.get(k, 0) for d in training_metrics) / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} - if k != "num_gradient_steps" + if k != TRAIN_GRADIENT_STEPS_KEY } # Get step and artifact name @@ -307,7 +308,13 @@ async def _train_model( assert pbar is not None and num_sequences is not None pbar.update(1) pbar.set_postfix(event.data) - yield {**event.data, "num_gradient_steps": num_sequences} + metrics = rename_train_metrics( + {k: float(v) for k, v in event.data.items()} + ) + yield { + **metrics, + TRAIN_GRADIENT_STEPS_KEY: float(num_sequences), + } elif event.type == "training_started": num_sequences = event.data["num_sequences"] if pbar is None: @@ -472,7 +479,13 @@ async def _train_sft( assert pbar is not None and num_batches is not None pbar.update(1) 
pbar.set_postfix(event.data) - yield {**event.data, "num_gradient_steps": num_batches} + metrics = rename_train_metrics( + {k: float(v) for k, v in event.data.items()} + ) + yield { + **metrics, + TRAIN_GRADIENT_STEPS_KEY: float(num_batches), + } elif event.type == "training_started": num_batches = event.data.get("num_sequences", 0) if pbar is None: diff --git a/src/art/tinker/service.py b/src/art/tinker/service.py index ba6768eb..1f5970ac 100644 --- a/src/art/tinker/service.py +++ b/src/art/tinker/service.py @@ -80,7 +80,7 @@ def custom_loss_fn( for mask, lp in zip(masks, logprobs_list): logprobs[mask] = lp loss = loss_fn(inputs, logprobs.unsqueeze(0), None, None, _config) - return loss.mean_policy_loss, {"policy_loss": loss.mean_policy_loss.item()} + return loss.mean_policy_loss, {"loss/train": loss.mean_policy_loss.item()} shifted_tokens = shift_tensor(packed_tensors["tokens"], 0) diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index e5eb1180..19df73dd 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -30,6 +30,7 @@ from .. 
import dev from ..backend import Backend from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing +from ..metrics_taxonomy import rename_train_metric_key from ..model import Model, TrainableModel from ..tinker.backend import get_renderer_name from ..tinker.server import get_free_port @@ -217,8 +218,8 @@ async def train( # type: ignore[override] ) metrics: dict[str, float] = { - "num_groups_submitted": float(len(groups_list)), - "num_datums": float(len(datums)), + "train/num_groups_submitted": float(len(groups_list)), + "data/step_num_datums": float(len(datums)), } if not datums: @@ -227,10 +228,12 @@ async def train( # type: ignore[override] train_tokens = 0 for datum in datums: train_tokens += len(datum.model_input.to_ints()) - metrics["train_tokens"] = float(train_tokens) + metrics["data/step_trainer_tokens"] = float(train_tokens) pricing = get_model_pricing(model.base_model) if pricing is not None: - metrics["costs_train"] = compute_train_cost(train_tokens, pricing) + metrics["costs/train/tinker_train"] = compute_train_cost( + train_tokens, pricing + ) if adam_params is None: adam_params = tinker.AdamParams( @@ -268,12 +271,12 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum: for key, value in forward_output.metrics.items(): if value is None: continue - metrics[key] = float(value) + metrics[rename_train_metric_key(key)] = float(value) if optim_output.metrics: for key, value in optim_output.metrics.items(): if value is None: continue - metrics[key] = float(value) + metrics[rename_train_metric_key(key)] = float(value) next_step = state.current_step + 1 checkpoint_name = f"step_{next_step:06d}" diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index 94d01b78..f3a69179 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -844,12 +844,12 @@ async def train_sft( batch_idx += 1 yield { - "loss": batch_loss, - "learning_rate": batch.learning_rate, - "grad_norm": grad_norm, - "num_trajectories": 
float(batch.num_trajectories), - "num_trainable_tokens": float(batch.num_trainable_tokens), - "tokens_per_second": tokens_per_second, + "loss/train": batch_loss, + "loss/learning_rate": batch.learning_rate, + "loss/grad_norm": grad_norm, + "train/num_trajectories": float(batch.num_trajectories), + "train/num_trainable_tokens": float(batch.num_trainable_tokens), + "throughput/train_tok_per_sec": tokens_per_second, } # === Cleanup === diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index 34dbc5cd..f095fe35 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -12,6 +12,7 @@ from .. import dev from ..loss import loss_fn, shift_tensor +from ..metrics_taxonomy import rename_train_metrics from ..types import TrainConfig if TYPE_CHECKING: @@ -169,14 +170,16 @@ def compute_loss( _config, ) - trainer._metrics["train"]["learning_rate"].append(config.learning_rate) - trainer._metrics["train"]["policy_loss"].append(loss.mean_policy_loss.item()) + trainer._metrics["train"]["loss/learning_rate"].append(config.learning_rate) + trainer._metrics["train"]["loss/train"].append(loss.mean_policy_loss.item()) if loss.mean_entropy is not None: - trainer._metrics["train"]["entropy"].append(loss.mean_entropy.item()) + trainer._metrics["train"]["loss/entropy"].append(loss.mean_entropy.item()) if config.beta > 0.0: - trainer._metrics["train"]["kl_div"].append(loss.mean_kl.item()) + trainer._metrics["train"]["loss/kl_div"].append(loss.mean_kl.item()) if loss.kl_policy_ref is not None: - trainer._metrics["train"]["kl_policy_ref"].append(loss.kl_policy_ref.item()) + trainer._metrics["train"]["loss/kl_policy_ref"].append( + loss.kl_policy_ref.item() + ) return loss.mean_policy_loss + config.beta * loss.mean_kl return compute_loss @@ -195,8 +198,7 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None: if next(iter(logs.keys())).startswith("eval_"): metrics = {f"eval_{key}": val for key, val in metrics.items()} - logs = {**logs, **metrics} - 
logs.pop("learning_rate", None) + logs = {**rename_train_metrics(logs), **metrics} results_queue.put_nowait(logs) trainer._metrics["train"].clear() diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 4d0415fc..c5feeefc 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -726,9 +726,21 @@ async def test_train_sft_aggregates_metrics(self, tmp_path: Path): async def mock_train_sft(*args, **kwargs): # Simulate 3 batches with different metrics - yield {"loss": 1.0, "learning_rate": 1e-4, "grad_norm": 0.5} - yield {"loss": 0.8, "learning_rate": 1e-4, "grad_norm": 0.4} - yield {"loss": 0.6, "learning_rate": 1e-4, "grad_norm": 0.3} + yield { + "loss/train": 1.0, + "loss/learning_rate": 1e-4, + "loss/grad_norm": 0.5, + } + yield { + "loss/train": 0.8, + "loss/learning_rate": 1e-4, + "loss/grad_norm": 0.4, + } + yield { + "loss/train": 0.6, + "loss/learning_rate": 1e-4, + "loss/grad_norm": 0.3, + } mock_backend._train_sft = mock_train_sft mock_backend._get_step = AsyncMock(return_value=1) # Step after training From 1ba0931603c786b1eeb4ddc458584a546a7c40a9 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:42:59 -0800 Subject: [PATCH 09/46] docs: Add metrics taxonomy guide and smoke example --- docs/metrics-taxonomy.md | 58 ++++++++++++++++++++++ examples/metrics_taxonomy_smoke.py | 78 ++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 docs/metrics-taxonomy.md create mode 100644 examples/metrics_taxonomy_smoke.py diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md new file mode 100644 index 00000000..1bbe3373 --- /dev/null +++ b/docs/metrics-taxonomy.md @@ -0,0 +1,58 @@ +# Metrics Taxonomy (Phase 1) + +Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups. 
+ +## Sections + +- `reward/*` +- `loss/*` +- `throughput/*` +- `costs/*` +- `time/*` +- `data/*` +- `train/*`, `val/*`, `test/*` + +## Train Key Mapping + +Current training code emits the following canonical keys (legacy key -> canonical key): + +- `reward` -> `reward/mean` +- `reward_std_dev` -> `reward/std_dev` +- `exception_rate` -> `reward/exception_rate` +- `group_metric_<name>` -> `reward/group_<name>` +- `policy_loss` / `loss` -> `loss/train` +- `entropy` -> `loss/entropy` +- `kl_div` -> `loss/kl_div` +- `kl_policy_ref` -> `loss/kl_policy_ref` +- `grad_norm` -> `loss/grad_norm` +- `learning_rate` -> `loss/learning_rate` +- `tokens_per_second` -> `throughput/train_tok_per_sec` +- `num_groups_submitted` -> `train/num_groups_submitted` +- `num_groups_trainable` -> `train/num_groups_trainable` +- `num_trajectories` -> `train/num_trajectories` +- `num_trainable_tokens` -> `train/num_trainable_tokens` +- `train_tokens` -> `data/step_trainer_tokens` +- `num_datums` -> `data/step_num_datums` +- `num_gradient_steps` -> `data/step_num_gradient_steps` + +## Cost Rollups + +Cost leaves can be logged with either: + +- hierarchical keys, e.g. `costs/train/llm_judge/correctness` +- legacy component keys, e.g. `costs_prefill`, `costs_sample` + +ART rolls costs up automatically: + +- parent rollups (for example `costs/train`, `costs/all`) +- cumulative keys with `_cum` suffix (for example `costs/all_cum`) + +## End-to-End Smoke Test + +Run: + +```bash +uv run python examples/metrics_taxonomy_smoke.py +``` + +This writes a local history file and, if `WANDB_API_KEY` is set, logs the same metrics to W&B. 
diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py new file mode 100644 index 00000000..25e9de1c --- /dev/null +++ b/examples/metrics_taxonomy_smoke.py @@ -0,0 +1,78 @@ +import asyncio +import json +import os +from pathlib import Path +import time + +import art + + +async def main() -> None: + project = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke") + model_name = os.environ.get( + "ART_METRICS_MODEL", f"metrics-smoke-{int(time.time())}" + ) + base_path = os.environ.get("ART_METRICS_BASE_PATH", ".art") + + model = art.Model( + name=model_name, + project=project, + base_path=base_path, + report_metrics=["wandb"], + ) + + for step in (1, 2): + trajectories = [ + art.TrajectoryGroup( + trajectories=[ + art.Trajectory( + reward=0.4 + 0.1 * step, + metrics={ + "judge_quality": 0.7 + 0.05 * step, + "reward/custom_prefixed": 0.2 * step, + }, + messages_and_choices=[ + {"role": "user", "content": f"smoke step {step}"}, + {"role": "assistant", "content": "ok"}, + ], + ) + ], + exceptions=[], + ) + ] + + await model.log( + trajectories, + split="train", + step=step, + metrics={ + "loss/train": 1.0 / step, + "loss/grad_norm": 0.5 + 0.1 * step, + "throughput/train_tok_per_sec": 1000.0 + 100.0 * step, + "time/step_wall_s": 1.5 + 0.2 * step, + "data/step_num_scenarios": 2.0, + "data/step_actor_tokens": 120.0 + 10.0 * step, + "costs_prefill": 0.10 * step, + "costs_sample": 0.05 * step, + "costs/train/llm_judge/correctness": 0.02 * step, + }, + ) + + history_path = Path(base_path) / project / "models" / model_name / "history.jsonl" + print(f"Wrote history: {history_path}") + + with open(history_path) as f: + rows = [json.loads(line) for line in f] + + print("\nLast row key excerpts:") + last = rows[-1] + show_prefixes = ("reward/", "loss/", "throughput/", "time/", "data/", "costs/") + for key in sorted(last): + if key.startswith(show_prefixes): + print(f"{key}: {last[key]}") + + print("\nIf WANDB_API_KEY is set, metrics are also 
logged to W&B.") + + +if __name__ == "__main__": + asyncio.run(main()) From 75068fdf31a0b6a460d34af3903105df501a03e6 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 17:50:55 -0800 Subject: [PATCH 10/46] fix: Bind nested cost metrics to training_step in W&B --- src/art/model.py | 22 ++++++++++++++++++ tests/unit/test_metric_routing.py | 38 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/art/model.py b/src/art/model.py index afe0073d..d3bec930 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -109,6 +109,7 @@ class Model( _s3_prefix: str | None = None _openai_client: AsyncOpenAI | None = None _wandb_run: Optional["Run"] = None # Private, for lazy wandb initialization + _wandb_defined_metrics: set[str] _run_start_time: float _metrics_builder: MetricsBuilder _metrics_builder_state_loaded: bool @@ -141,6 +142,7 @@ def __init__( report_metrics=report_metrics, **kwargs, ) + object.__setattr__(self, "_wandb_defined_metrics", set()) object.__setattr__(self, "_run_start_time", time.time()) object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train")) object.__setattr__(self, "_metrics_builder_state_loaded", False) @@ -397,6 +399,14 @@ def _get_wandb_run(self) -> Optional["Run"]: ), ) self._wandb_run = run + object.__setattr__( + self, + "_wandb_defined_metrics", + { + "training_step", + "time/wall_clock_sec", + }, + ) # Define training_step as the x-axis for all metrics. # This allows out-of-order logging (e.g., async validation for previous steps). 
@@ -461,8 +471,20 @@ def _log_metrics( ) or (self.report_metrics is not None and "wandb" in self.report_metrics) if should_log_wandb: if run := self._get_wandb_run(): + self._define_wandb_step_metrics(prefixed.keys()) run.log(prefixed) + def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None: + import wandb + + for key in keys: + if not key.startswith("costs/"): + continue + if key in self._wandb_defined_metrics: + continue + wandb.define_metric(key, step_metric="training_step") + self._wandb_defined_metrics.add(key) + def _extract_non_cost_metrics( self, metrics: dict[str, float], split: str ) -> dict[str, float]: diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py index e83a48ed..2587385d 100644 --- a/tests/unit/test_metric_routing.py +++ b/tests/unit/test_metric_routing.py @@ -74,3 +74,41 @@ def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None (("val/*",), {"step_metric": "training_step"}), (("test/*",), {"step_metric": "training_step"}), ] + + def test_log_metrics_defines_nested_cost_keys_with_training_step( + self, tmp_path: Path + ) -> None: + fake_run = MagicMock() + fake_run._is_finished = False + + fake_wandb = types.SimpleNamespace() + fake_wandb.init = MagicMock(return_value=fake_run) + fake_wandb.define_metric = MagicMock() + fake_wandb.Settings = lambda **kwargs: kwargs + + with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False): + with patch.dict("sys.modules", {"wandb": fake_wandb}): + model = Model( + name="test-model", + project="test-project", + base_path=str(tmp_path), + report_metrics=["wandb"], + ) + model._log_metrics( + { + "costs/train/sample": 0.1, + "costs/train/prefill_cum": 0.2, + }, + split="train", + step=1, + ) + + define_calls = [ + (call.args, call.kwargs) + for call in fake_wandb.define_metric.call_args_list + ] + assert (("costs/train/sample",), {"step_metric": "training_step"}) in define_calls + assert ( + (("costs/train/prefill_cum",), 
{"step_metric": "training_step"}) + in define_calls + ) From f958e3ca4d72e7aa65f215ed3780c92ad618611e Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 18:23:53 -0800 Subject: [PATCH 11/46] feat: Add API cost decorator and metrics context wiring --- src/art/metrics.py | 335 +++++++++++++++++++++++++++- src/art/model.py | 11 + src/art/pipeline_trainer/trainer.py | 12 +- 3 files changed, 350 insertions(+), 8 deletions(-) diff --git a/src/art/metrics.py b/src/art/metrics.py index 4ff89f00..623aadcf 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -1,26 +1,164 @@ from __future__ import annotations import asyncio +from collections.abc import Callable +from contextlib import contextmanager from contextvars import ContextVar, Token -from typing import Any +from dataclasses import dataclass +from functools import wraps +from inspect import iscoroutinefunction +from typing import Any, ParamSpec, TypeVar + +from .costs import tokens_to_cost _active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder") _HIERARCHICAL_SECTIONS = {"costs", "time", "data"} +_DEFAULT_PROVIDER = "openai" +_OPENAI_PROVIDER = "openai" +_ANTHROPIC_PROVIDER = "anthropic" + +P = ParamSpec("P") +R = TypeVar("R") + + +CostExtractor = Callable[[Any], float | None] +ResponseGetter = Callable[[Any], Any] + + +@dataclass(frozen=True) +class TokenPricing: + prompt_per_million: float + completion_per_million: float + + +_DEFAULT_TOKEN_PRICING = { + _OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0), + _ANTHROPIC_PROVIDER: TokenPricing( + prompt_per_million=3.0, completion_per_million=15.0 + ), +} + + +@dataclass +class _SharedMetricsState: + lock: asyncio.Lock + step_buffer: dict[str, float] + cum_state: dict[str, float] + unique_scenario_ids: set[str] + cost_extractors: dict[str, CostExtractor] + token_pricing: dict[str, TokenPricing] + + +def _new_shared_metrics_state() -> _SharedMetricsState: + return _SharedMetricsState( + 
lock=asyncio.Lock(), + step_buffer={}, + cum_state={}, + unique_scenario_ids=set(), + cost_extractors={}, + token_pricing=dict(_DEFAULT_TOKEN_PRICING), + ) + + +def _normalize_provider(provider: str | None) -> str | None: + if provider is None: + return None + normalized = provider.strip().lower() + if not normalized: + return None + return normalized + + +def _read_usage_field(usage: Any, field: str) -> float | None: + if usage is None: + return None + if isinstance(usage, dict): + value = usage.get(field) + else: + value = getattr(usage, field, None) + if value is None: + return None + return float(value) + + +def _response_usage(response: Any) -> Any: + if isinstance(response, dict): + return response.get("usage") + return getattr(response, "usage", None) + + +def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None: + usage = _response_usage(response) + prompt_tokens = _read_usage_field(usage, "prompt_tokens") + completion_tokens = _read_usage_field(usage, "completion_tokens") + if prompt_tokens is None and completion_tokens is None: + return None + return prompt_tokens or 0.0, completion_tokens or 0.0 + + +def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None: + usage = _response_usage(response) + input_tokens = _read_usage_field(usage, "input_tokens") + output_tokens = _read_usage_field(usage, "output_tokens") + if input_tokens is None and output_tokens is None: + return None + return input_tokens or 0.0, output_tokens or 0.0 + + +def _detect_provider(response: Any) -> str | None: + usage = _response_usage(response) + if usage is None: + return None + + if ( + _read_usage_field(usage, "prompt_tokens") is not None + or _read_usage_field(usage, "completion_tokens") is not None + ): + return _OPENAI_PROVIDER + if ( + _read_usage_field(usage, "input_tokens") is not None + or _read_usage_field(usage, "output_tokens") is not None + ): + return _ANTHROPIC_PROVIDER + return None + + +def _estimate_cost( + token_counts: 
tuple[float, float] | None, + pricing: TokenPricing, +) -> float | None: + if token_counts is None: + return None + prompt_tokens, completion_tokens = token_counts + return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost( + completion_tokens, + pricing.completion_per_million, + ) class MetricsBuilder: """Build and accumulate step-level metrics for logging.""" - def __init__(self, cost_context: str) -> None: + def __init__( + self, + cost_context: str, + *, + _shared_state: _SharedMetricsState | None = None, + ) -> None: if not cost_context: raise ValueError("cost_context must be non-empty") self.cost_context = cost_context - self._lock = asyncio.Lock() - self._step_buffer: dict[str, float] = {} - self._cum_state: dict[str, float] = {} - self._unique_scenario_ids: set[str] = set() + self._shared_state = ( + _shared_state if _shared_state is not None else _new_shared_metrics_state() + ) + self._lock = self._shared_state.lock + self._step_buffer = self._shared_state.step_buffer + self._cum_state = self._shared_state.cum_state + self._unique_scenario_ids = self._shared_state.unique_scenario_ids + self._cost_extractors = self._shared_state.cost_extractors + self._token_pricing = self._shared_state.token_pricing def add_cost(self, path: str, usd: float) -> None: if not path: @@ -99,10 +237,52 @@ async def flush(self, step: int) -> dict[str, float]: def activate(self) -> Token["MetricsBuilder"]: return _active_builder.set(self) + @contextmanager + def activate_context(self): + token = self.activate() + try: + yield self + finally: + token.var.reset(token) + @staticmethod def get_active() -> "MetricsBuilder": return _active_builder.get() + def for_cost_context(self, cost_context: str) -> "MetricsBuilder": + normalized_cost_context = cost_context.strip() + if not normalized_cost_context: + raise ValueError("cost_context must be non-empty") + if normalized_cost_context == self.cost_context: + return self + return MetricsBuilder( + 
cost_context=normalized_cost_context, + _shared_state=self._shared_state, + ) + + def register_cost_extractor( + self, provider: str, extractor: CostExtractor + ) -> None: + normalized_provider = _normalize_provider(provider) + if normalized_provider is None: + raise ValueError("provider must be non-empty") + self._cost_extractors[normalized_provider] = extractor + + def register_token_pricing( + self, + provider: str, + *, + prompt_per_million: float, + completion_per_million: float, + ) -> None: + normalized_provider = _normalize_provider(provider) + if normalized_provider is None: + raise ValueError("provider must be non-empty") + self._token_pricing[normalized_provider] = TokenPricing( + prompt_per_million=float(prompt_per_million), + completion_per_million=float(completion_per_million), + ) + def state_dict(self) -> dict[str, Any]: return { "cum_state": dict(self._cum_state), @@ -172,3 +352,146 @@ def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]: rollups["costs/all"] = costs_all return rollups + + def _resolve_token_pricing( + self, + provider: str | None, + *, + prompt_price_per_million: float | None = None, + completion_price_per_million: float | None = None, + ) -> TokenPricing: + normalized_provider = _normalize_provider(provider) or _DEFAULT_PROVIDER + default_pricing = self._token_pricing.get( + normalized_provider, + self._token_pricing[_DEFAULT_PROVIDER], + ) + return TokenPricing( + prompt_per_million=( + float(prompt_price_per_million) + if prompt_price_per_million is not None + else default_pricing.prompt_per_million + ), + completion_per_million=( + float(completion_price_per_million) + if completion_price_per_million is not None + else default_pricing.completion_per_million + ), + ) + + def _extract_api_cost( + self, + response: Any, + *, + provider: str | None = None, + prompt_price_per_million: float | None = None, + completion_price_per_million: float | None = None, + ) -> float | None: + provider_name = 
_normalize_provider(provider) or _detect_provider(response) + if provider_name is not None: + custom_extractor = self._cost_extractors.get(provider_name) + if custom_extractor is not None: + custom_cost = custom_extractor(response) + if custom_cost is not None: + return float(custom_cost) + + token_pricing = self._resolve_token_pricing( + provider_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + if provider_name == _OPENAI_PROVIDER: + return _estimate_cost( + _extract_openai_token_counts(response), + token_pricing, + ) + if provider_name == _ANTHROPIC_PROVIDER: + return _estimate_cost( + _extract_anthropic_token_counts(response), + token_pricing, + ) + + token_pricing = self._resolve_token_pricing( + provider_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + token_counts = _extract_openai_token_counts(response) + if token_counts is None: + token_counts = _extract_anthropic_token_counts(response) + return _estimate_cost(token_counts, token_pricing) + + +def _record_api_cost( + *, + result: Any, + source: str, + provider: str | None, + response_getter: ResponseGetter | None, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, +) -> None: + try: + builder = MetricsBuilder.get_active() + except LookupError: + return + + response = response_getter(result) if response_getter is not None else result + cost = builder._extract_api_cost( + response, + provider=provider, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + if cost is None: + return + builder.add_cost(f"{builder.cost_context}/{source}", cost) + + +def track_api_cost( + *, + source: str, + provider: str | None = None, + response_getter: ResponseGetter | None = None, + prompt_price_per_million: float | None = None, + completion_price_per_million: float | None = 
None, +) -> Callable[[Callable[P, R]], Callable[P, R]]: + normalized_source = source.strip("/") + if not normalized_source: + raise ValueError("source must be non-empty") + + normalized_provider = _normalize_provider(provider) + + def _decorate(func: Callable[P, R]) -> Callable[P, R]: + if iscoroutinefunction(func): + + @wraps(func) + async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): + result = await func(*args, **kwargs) + _record_api_cost( + result=result, + source=normalized_source, + provider=normalized_provider, + response_getter=response_getter, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + return result + + return _async_wrapper + + @wraps(func) + def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): + result = func(*args, **kwargs) + _record_api_cost( + result=result, + source=normalized_source, + provider=normalized_provider, + response_getter=response_getter, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + return result + + return _sync_wrapper + + return _decorate diff --git a/src/art/model.py b/src/art/model.py index d3bec930..91f9e81b 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -1,4 +1,5 @@ import asyncio +from contextvars import Token from datetime import datetime import json import os @@ -510,6 +511,16 @@ def _extract_non_cost_metrics( non_cost_metrics[metric] = numeric_value return non_cost_metrics + def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder: + if cost_context is None: + return self._metrics_builder + return self._metrics_builder.for_cost_context(cost_context) + + def activate_metrics_context( + self, cost_context: str + ) -> Token[MetricsBuilder]: + return self.metrics_builder(cost_context).activate() + def _load_metrics_builder_state(self) -> None: if self._metrics_builder_state_loaded: return diff --git a/src/art/pipeline_trainer/trainer.py 
b/src/art/pipeline_trainer/trainer.py index a061636b..9dcec1cd 100644 --- a/src/art/pipeline_trainer/trainer.py +++ b/src/art/pipeline_trainer/trainer.py @@ -328,7 +328,11 @@ async def _rollout_worker(self, worker_id: int) -> None: initial_version = self.state.policy_version - group = await self.rollout_fn(self.model, scenario, self.config) + token = self.model.activate_metrics_context("train") + try: + group = await self.rollout_fn(self.model, scenario, self.config) + finally: + token.var.reset(token) if not isinstance(group, TrajectoryGroup): errored = True continue @@ -562,7 +566,11 @@ async def _run_eval(self, step: int) -> None: self._status.note_val_started(step) reward: float | None = None try: - result = await self.eval_fn(self.model, step, self.config) + token = self.model.activate_metrics_context("eval") + try: + result = await self.eval_fn(self.model, step, self.config) + finally: + token.var.reset(token) splits: dict[str, list[art.Trajectory | art.TrajectoryGroup]] if isinstance(result, dict): splits = result From 7294638bbe7b7bd3f27bffd6db686a75ee192509 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 18:23:57 -0800 Subject: [PATCH 12/46] test: Add coverage for API cost decorator and context routing --- tests/unit/test_track_api_cost.py | 310 ++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 tests/unit/test_track_api_cost.py diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py new file mode 100644 index 00000000..4cb6dd34 --- /dev/null +++ b/tests/unit/test_track_api_cost.py @@ -0,0 +1,310 @@ +import asyncio +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from art import Model, TrainableModel, Trajectory, TrajectoryGroup +from art.metrics import MetricsBuilder, track_api_cost +from art.pipeline_trainer.trainer import PipelineTrainer + + +class _OpenAIUsage: + def __init__(self, prompt_tokens: int, completion_tokens: int) -> 
None: + self.prompt_tokens = prompt_tokens + self.completion_tokens = completion_tokens + + +class _OpenAIResponse: + def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: + self.usage = _OpenAIUsage(prompt_tokens, completion_tokens) + + +class _AnthropicUsage: + def __init__(self, input_tokens: int, output_tokens: int) -> None: + self.input_tokens = input_tokens + self.output_tokens = output_tokens + + +class _AnthropicResponse: + def __init__(self, input_tokens: int, output_tokens: int) -> None: + self.usage = _AnthropicUsage(input_tokens, output_tokens) + + +class TestTrackApiCost: + @pytest.mark.asyncio + async def test_openai_cost_extraction_with_explicit_pricing(self) -> None: + builder = MetricsBuilder(cost_context="train") + + @track_api_cost( + source="llm_judge/correctness", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, + ) + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=100, completion_tokens=50) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush(step=1) + assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) + + @pytest.mark.asyncio + async def test_anthropic_cost_extraction_uses_registered_pricing(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.register_token_pricing( + "anthropic", + prompt_per_million=5.0, + completion_per_million=7.0, + ) + + @track_api_cost(source="llm_judge/faithfulness") + async def _judge() -> _AnthropicResponse: + return _AnthropicResponse(input_tokens=40, output_tokens=60) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush(step=1) + assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062) + + @pytest.mark.asyncio + async def test_custom_extractor_takes_precedence(self) -> None: + builder = 
MetricsBuilder(cost_context="train") + builder.register_cost_extractor("openai", lambda _response: 0.75) + + @track_api_cost( + source="llm_judge/custom", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, + ) + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=1, completion_tokens=1) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush(step=1) + assert metrics["costs/train/llm_judge/custom"] == pytest.approx(0.75) + + @pytest.mark.asyncio + async def test_decorator_noops_without_active_builder(self) -> None: + @track_api_cost(source="llm_judge/no_context", provider="openai") + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=10, completion_tokens=20) + + result = await _judge() + assert isinstance(result, _OpenAIResponse) + + @pytest.mark.asyncio + async def test_for_cost_context_routes_to_eval_and_shares_state(self) -> None: + builder = MetricsBuilder(cost_context="train") + eval_builder = builder.for_cost_context("eval") + + @track_api_cost( + source="llm_judge/correctness", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, + ) + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=100, completion_tokens=50) + + token = eval_builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush(step=1) + assert metrics["costs/eval/llm_judge/correctness"] == pytest.approx(0.0002) + + +class TestTrackApiCostIntegration: + @pytest.mark.asyncio + async def test_model_log_emits_train_and_eval_costs(self, tmp_path: Path) -> None: + model = Model( + name="metrics-cost-test", + project="metrics-cost-test", + base_path=str(tmp_path), + report_metrics=[], + ) + + @track_api_cost( + source="llm_judge/correctness", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, + ) 
+ async def _train_judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=100, completion_tokens=50) + + @track_api_cost( + source="llm_judge/factuality", + provider="anthropic", + prompt_price_per_million=3.0, + completion_price_per_million=4.0, + ) + async def _eval_judge() -> _AnthropicResponse: + return _AnthropicResponse(input_tokens=40, output_tokens=10) + + train_token = model.activate_metrics_context("train") + try: + await _train_judge() + finally: + train_token.var.reset(train_token) + + await model.log(trajectories=None, split="train", step=1, metrics={}) + + eval_token = model.activate_metrics_context("eval") + try: + await _eval_judge() + finally: + eval_token.var.reset(eval_token) + + await model.log(trajectories=None, split="val", step=2, metrics={}) + + history_path = ( + tmp_path + / "metrics-cost-test" + / "models" + / "metrics-cost-test" + / "history.jsonl" + ) + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) + assert second["costs/eval/llm_judge/factuality"] == pytest.approx(0.00016) + assert second["costs/all_cum"] == pytest.approx(0.00036) + + @pytest.mark.asyncio + async def test_pipeline_trainer_activates_train_context_for_rollouts( + self, tmp_path: Path + ) -> None: + model = TrainableModel( + name="pipeline-context-test", + project="pipeline-context-test", + base_model="test-model", + base_path=str(tmp_path), + report_metrics=[], + ) + backend = MagicMock() + observed_contexts: list[str] = [] + + async def rollout_fn( + _model: TrainableModel, + _scenario: dict, + _config: dict, + ) -> TrajectoryGroup: + observed_contexts.append(MetricsBuilder.get_active().cost_context) + return TrajectoryGroup( + [ + Trajectory( + reward=1.0, + messages_and_choices=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ], + ) + ] + ) + + trainer = PipelineTrainer( + model=model, + 
backend=backend, + rollout_fn=rollout_fn, + scenarios=[{"metadata": {"scenario_id": "s1"}}], + config={}, + num_rollout_workers=1, + min_batch_size=1, + max_batch_size=1, + eval_fn=None, + ) + trainer._output_queue = asyncio.Queue() + + await trainer._rollout_worker(worker_id=0) + + assert observed_contexts == ["train"] + + @pytest.mark.asyncio + async def test_pipeline_trainer_activates_eval_context_for_eval_fn( + self, tmp_path: Path + ) -> None: + model = TrainableModel( + name="pipeline-eval-context-test", + project="pipeline-eval-context-test", + base_model="test-model", + base_path=str(tmp_path), + report_metrics=[], + ) + backend = MagicMock() + observed_contexts: list[str] = [] + + @track_api_cost( + source="llm_judge/correctness", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, + ) + async def _judge_call() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=100, completion_tokens=50) + + async def eval_fn( + _model: TrainableModel, + _step: int, + _config: dict, + ) -> list[Trajectory]: + observed_contexts.append(MetricsBuilder.get_active().cost_context) + await _judge_call() + return [ + Trajectory( + reward=1.0, + messages_and_choices=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ], + ) + ] + + trainer = PipelineTrainer( + model=model, + backend=backend, + rollout_fn=lambda *_args, **_kwargs: asyncio.sleep(0), + scenarios=[], + config={}, + num_rollout_workers=1, + min_batch_size=1, + max_batch_size=1, + eval_fn=eval_fn, + ) + + await trainer._run_eval(step=1) + + assert observed_contexts == ["eval"] + + history_path = ( + tmp_path + / "pipeline-eval-context-test" + / "models" + / "pipeline-eval-context-test" + / "history.jsonl" + ) + with open(history_path) as f: + rows = [json.loads(line) for line in f if line.strip()] + + assert any("costs/eval/llm_judge/correctness" in row for row in rows) From c91cf271f111825f991b57f85033d9f7eab639a1 Mon Sep 17 00:00:00 2001 
From: Vivek Kalyan Date: Wed, 4 Mar 2026 18:24:01 -0800 Subject: [PATCH 13/46] docs: Add API cost decorator guide and smoke demo --- docs/metrics-taxonomy.md | 57 +++++++++++++++++++++++++++++- examples/metrics_taxonomy_smoke.py | 34 ++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index 1bbe3373..b5a2294c 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -1,4 +1,4 @@ -# Metrics Taxonomy (Phase 1) +# Metrics Taxonomy (Phase 1-3) Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups. @@ -56,3 +56,58 @@ uv run python examples/metrics_taxonomy_smoke.py ``` This writes a local history file and, if `WANDB_API_KEY` is set, logs the same metrics to W&B. + +## API Cost Decorator (Phase 2/3) + +Use `@track_api_cost` to automatically write judge/API spend into `costs/{train|eval}/...`. + +```python +from art.metrics import track_api_cost + +@track_api_cost( + source="llm_judge/correctness", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, +) +async def run_judge(client, messages): + return await client.chat.completions.create( + model="gpt-4o-mini", + messages=messages, + ) +``` + +Activate metric cost context while running train/eval logic: + +```python +train_token = model.activate_metrics_context("train") +try: + await run_judge(client, train_messages) +finally: + train_token.var.reset(train_token) + +eval_token = model.activate_metrics_context("eval") +try: + await run_judge(client, eval_messages) +finally: + eval_token.var.reset(eval_token) +``` + +The next `model.log(...)` flush for that step will include: + +- `costs/train/llm_judge/correctness` (or `costs/eval/...`) +- hierarchical rollups like `costs/train`, `costs/all` +- cumulative keys like `costs/all_cum` + +Built-in providers: + +- OpenAI usage (`prompt_tokens`, `completion_tokens`) +- Anthropic usage (`input_tokens`, `output_tokens`) + +You can 
override pricing per decorator call or configure builder-level defaults: + +```python +builder = model.metrics_builder() +builder.register_token_pricing("openai", prompt_per_million=1.2, completion_per_million=4.8) +builder.register_cost_extractor("openai", lambda response: 0.001) # optional custom extractor +``` diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py index 25e9de1c..28941b1f 100644 --- a/examples/metrics_taxonomy_smoke.py +++ b/examples/metrics_taxonomy_smoke.py @@ -5,6 +5,34 @@ import time import art +from art.metrics import track_api_cost + + +class _Usage: + def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: + self.prompt_tokens = prompt_tokens + self.completion_tokens = completion_tokens + + +class _Response: + def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: + self.usage = _Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + + +@track_api_cost( + source="llm_judge/decorator_demo", + provider="openai", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, +) +async def _mock_judge_call(step: int) -> _Response: + return _Response( + prompt_tokens=50 * step, + completion_tokens=20 * step, + ) async def main() -> None: @@ -22,6 +50,12 @@ async def main() -> None: ) for step in (1, 2): + train_token = model.activate_metrics_context("train") + try: + await _mock_judge_call(step) + finally: + train_token.var.reset(train_token) + trajectories = [ art.TrajectoryGroup( trajectories=[ From 6fb0d8cc56d2f137a1dac9657670cfaab85e7966 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Wed, 4 Mar 2026 18:25:10 -0800 Subject: [PATCH 14/46] fix: Parse entity and project in metrics smoke config --- examples/metrics_taxonomy_smoke.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py index 28941b1f..4f2c4a2f 100644 --- 
a/examples/metrics_taxonomy_smoke.py +++ b/examples/metrics_taxonomy_smoke.py @@ -36,7 +36,15 @@ async def _mock_judge_call(step: int) -> _Response: async def main() -> None: - project = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke") + project_spec = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke") + entity = os.environ.get("ART_METRICS_ENTITY") + project = project_spec + if entity is None and "/" in project_spec: + split_entity, split_project = project_spec.split("/", 1) + if split_entity and split_project: + entity = split_entity + project = split_project + model_name = os.environ.get( "ART_METRICS_MODEL", f"metrics-smoke-{int(time.time())}" ) @@ -45,6 +53,7 @@ async def main() -> None: model = art.Model( name=model_name, project=project, + entity=entity, base_path=base_path, report_metrics=["wandb"], ) From 754ef57c319f248ff1acf82c73b97f501f2f4fab Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 10:36:04 -0700 Subject: [PATCH 15/46] test: Cover metrics builder resume and cumulative routing --- tests/unit/test_frontend_logging.py | 30 +++++++++++++++++++++++ tests/unit/test_metrics_builder.py | 37 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index c5feeefc..1ece42e6 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -588,6 +588,36 @@ async def test_cost_cumulative_persists_across_model_recreation( assert second["costs/train/prefill_cum"] == pytest.approx(1.0) assert second["costs/all_cum"] == pytest.approx(1.0) + @pytest.mark.asyncio + async def test_direct_time_and_data_metrics_get_cumulative_variants( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + await model.log( + trajectories=None, + split="train", + step=1, + metrics={ + "time/step_actor_s": 1.5, + "data/step_actor_tokens": 10, 
+ }, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["time/step_actor_s"] == pytest.approx(1.5) + assert entry["time/step_actor_s_cum"] == pytest.approx(1.5) + assert entry["data/step_actor_tokens"] == pytest.approx(10) + assert entry["data/step_actor_tokens_cum"] == pytest.approx(10) + class TestWandbIntegration: """Test wandb integration logic (without mocking wandb itself).""" diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 7b56c150..db083242 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -57,6 +57,27 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None: assert second["data/step_actor_tokens_cum"] == pytest.approx(15) assert second["data/cum_num_unique_scenarios"] == 3 + @pytest.mark.asyncio + async def test_helper_metrics_accumulate_within_a_single_step(self) -> None: + builder = MetricsBuilder(cost_context="train") + + builder.add_data(step_num_scenarios=2, step_actor_tokens=10) + builder.add_data(step_num_scenarios=3, step_actor_tokens=5) + builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3, step_eval_s=0.2) + builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2, step_eval_s=0.1) + builder.add_idle_times(step_trainer_idle_s=1.0, step_actor_idle_s=2.0) + builder.add_idle_times(step_trainer_idle_s=0.5, step_actor_idle_s=1.0) + + metrics = await builder.flush(step=1) + + assert metrics["data/step_num_scenarios"] == pytest.approx(5) + assert metrics["data/step_actor_tokens"] == pytest.approx(15) + assert metrics["time/step_wall_s"] == pytest.approx(2.0) + assert metrics["time/step_actor_s"] == pytest.approx(0.5) + assert metrics["time/step_eval_s"] == pytest.approx(0.3) + assert metrics["throughput/step_trainer_idle_s"] == pytest.approx(1.5) + assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0) + @pytest.mark.asyncio async def 
test_costs_all_generated_for_single_and_multiple_children(self) -> None: single = MetricsBuilder(cost_context="train") @@ -128,6 +149,22 @@ async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: assert metrics["costs/train_cum"] == pytest.approx(3.0) assert metrics["costs/all_cum"] == pytest.approx(3.0) + @pytest.mark.asyncio + async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: + before = MetricsBuilder(cost_context="train") + before.add_cost("train/gpu", usd=1.0) + await before.flush(step=1) + + after = MetricsBuilder(cost_context="train") + after.load_state_dict(before.state_dict()) + + eval_builder = after.for_cost_context("eval") + eval_builder.add_cost("eval/judge", usd=2.0) + + metrics = await eval_builder.flush(step=2) + assert metrics["costs/eval/judge"] == pytest.approx(2.0) + assert metrics["costs/all_cum"] == pytest.approx(3.0) + @pytest.mark.asyncio async def test_unique_scenario_count_tracks_exact_ids(self) -> None: builder = MetricsBuilder(cost_context="train") From 4659a5ba4128feb639dc14548ce34d10fcf6c582 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 10:37:25 -0700 Subject: [PATCH 16/46] fix: Restore MetricsBuilder cumulative state and routing --- src/art/metrics.py | 38 +++++++++++++++++++++++++++----------- src/art/model.py | 4 ++++ 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/src/art/metrics.py b/src/art/metrics.py index 623aadcf..ea299427 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -164,7 +164,12 @@ def add_cost(self, path: str, usd: float) -> None: if not path: raise ValueError("Cost path must be non-empty") full_key = f"costs/{path}" - self._validate_and_add(full_key, float(usd)) + self.add_metric(full_key, float(usd)) + + def add_metric(self, key: str, value: float) -> None: + if "/" not in key: + raise ValueError("Metric key must include a section prefix") + self._validate_and_add(key, float(value)) def add_data( self, @@ -173,9 +178,9 @@ 
def add_data( scenario_ids: list[str] | None = None, ) -> None: if step_num_scenarios is not None: - self._step_buffer["data/step_num_scenarios"] = float(step_num_scenarios) + self.add_metric("data/step_num_scenarios", float(step_num_scenarios)) if step_actor_tokens is not None: - self._step_buffer["data/step_actor_tokens"] = float(step_actor_tokens) + self.add_metric("data/step_actor_tokens", float(step_actor_tokens)) if scenario_ids is not None: self._unique_scenario_ids.update(scenario_ids) @@ -186,11 +191,11 @@ def add_user_timing( step_eval_s: float | None = None, ) -> None: if step_wall_s is not None: - self._step_buffer["time/step_wall_s"] = float(step_wall_s) + self.add_metric("time/step_wall_s", float(step_wall_s)) if step_actor_s is not None: - self._step_buffer["time/step_actor_s"] = float(step_actor_s) + self.add_metric("time/step_actor_s", float(step_actor_s)) if step_eval_s is not None: - self._step_buffer["time/step_eval_s"] = float(step_eval_s) + self.add_metric("time/step_eval_s", float(step_eval_s)) def add_idle_times( self, @@ -198,11 +203,12 @@ def add_idle_times( step_actor_idle_s: float | None = None, ) -> None: if step_trainer_idle_s is not None: - self._step_buffer["throughput/step_trainer_idle_s"] = float( - step_trainer_idle_s + self.add_metric( + "throughput/step_trainer_idle_s", + float(step_trainer_idle_s), ) if step_actor_idle_s is not None: - self._step_buffer["throughput/step_actor_idle_s"] = float(step_actor_idle_s) + self.add_metric("throughput/step_actor_idle_s", float(step_actor_idle_s)) async def flush(self, step: int) -> dict[str, float]: del step @@ -292,8 +298,18 @@ def state_dict(self) -> dict[str, Any]: def load_state_dict(self, state: dict[str, Any]) -> None: raw_cum_state = state.get("cum_state", {}) raw_unique_ids = state.get("unique_scenario_ids", []) - self._cum_state = {str(k): float(v) for k, v in raw_cum_state.items()} - self._unique_scenario_ids = {str(v) for v in raw_unique_ids} + restored_cum_state = {str(k): 
float(v) for k, v in raw_cum_state.items()} + restored_unique_ids = {str(v) for v in raw_unique_ids} + + self._shared_state.cum_state.clear() + self._shared_state.cum_state.update(restored_cum_state) + self._shared_state.unique_scenario_ids.clear() + self._shared_state.unique_scenario_ids.update(restored_unique_ids) + + # Keep local references aligned with the shared state so derived builders + # created before or after resume observe the same cumulative state. + self._cum_state = self._shared_state.cum_state + self._unique_scenario_ids = self._shared_state.unique_scenario_ids def _validate_and_add(self, key: str, value: float) -> None: if key.endswith("_cum"): diff --git a/src/art/model.py b/src/art/model.py index 91f9e81b..f23cad63 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -33,6 +33,7 @@ COSTS_METRIC_PREFIX = "costs_" COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" METRICS_BUILDER_STATE_KEY = "_metrics_builder_state" +BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_") METRIC_SECTIONS = frozenset( { "reward", @@ -508,6 +509,9 @@ def _extract_non_cost_metrics( f"{cost_context}/{component}", numeric_value ) continue + if metric.startswith(BUILDER_CUMULATIVE_PREFIXES): + self._metrics_builder.add_metric(metric, numeric_value) + continue non_cost_metrics[metric] = numeric_value return non_cost_metrics From 26c7406467510f4d4a08393bb573d99276eb7e75 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 10:56:07 -0700 Subject: [PATCH 17/46] test: Cover taxonomy timing and data metrics --- tests/unit/test_frontend_logging.py | 82 +++++++++++++++++++++++++---- tests/unit/test_metrics_builder.py | 17 ++++++ tests/unit/test_track_api_cost.py | 1 + 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 1ece42e6..3230af03 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -225,8 +225,9 @@ async def 
test_history_appends_entries( history_path = tmp_path / "test-project/models/test-model/history.jsonl" df = pl.read_ndjson(str(history_path)) - # Should have 2 entries - assert len(df) == 2 + # Each log call now emits the primary metrics row plus a taxonomy + # row for cumulative data/time metrics. + assert len(df) == 4 # Check both splits are present columns = df.columns @@ -506,6 +507,62 @@ async def test_train_trajectory_metrics_default_to_reward_prefix( assert entry["reward/custom_score"] == 1.0 assert entry["reward/prefixed"] == 2.0 + @pytest.mark.asyncio + async def test_train_logs_add_default_data_metrics_from_trajectory_groups( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + trajectories = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.8, + messages_and_choices=[{"role": "user", "content": "a"}], + ), + Trajectory( + reward=0.2, + messages_and_choices=[{"role": "user", "content": "b"}], + ), + ], + metadata={"scenario_scenario_id": "scenario-1"}, + ), + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.5, + messages_and_choices=[{"role": "user", "content": "c"}], + ) + ], + exceptions=[], + metadata={"scenario_scenario_id": "scenario-2"}, + ), + ] + + await model.log(trajectories, split="train", step=1) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + rows = [json.loads(line) for line in f if line.strip()] + + merged: dict[str, float] = {} + for row in rows: + merged.update(row) + + assert merged["data/step_num_scenarios"] == pytest.approx(2.0) + assert merged["data/step_num_trajectories"] == pytest.approx(3.0) + assert merged["data/step_num_groups_submitted"] == pytest.approx(2.0) + assert merged["data/step_num_groups_trainable"] == pytest.approx(1.0) + assert merged["data/cum_num_unique_scenarios"] == pytest.approx(2.0) + assert merged["train/num_groups_submitted"] == pytest.approx(2.0) + assert 
merged["train/num_groups_trainable"] == pytest.approx(1.0) + assert merged["train/num_trajectories"] == pytest.approx(3.0) + @pytest.mark.asyncio async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): model = Model( @@ -798,13 +855,18 @@ async def mock_train_sft(*args, **kwargs): with open(history_path) as f: lines = f.readlines() - assert len(lines) == 1, f"Expected 1 log entry, got {len(lines)}" + assert len(lines) == 2, f"Expected 2 log entries, got {len(lines)}" + + entries = [json.loads(line) for line in lines] + merged: dict[str, float] = {} + for entry in entries: + merged.update(entry) - # Verify metrics are aggregated (averaged) - entry = json.loads(lines[0]) - assert entry["step"] == 1 - assert entry["loss/train"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 - assert entry["loss/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 + assert all(entry["step"] == 1 for entry in entries) + assert merged["loss/train"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 + assert merged["loss/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 + assert merged["time/step_trainer_s"] >= 0 + assert merged["time/step_trainer_s_cum"] >= 0 @pytest.mark.asyncio async def test_train_sft_single_step_increment(self, tmp_path: Path): @@ -841,8 +903,8 @@ async def mock_train_sft(*args, **kwargs): history_path = tmp_path / "test-project/models/test-sft-step/history.jsonl" df = pl.read_ndjson(str(history_path)) - assert len(df) == 1, "Should have exactly 1 log entry" - assert df["step"][0] == 1, "Step should be 1 (single increment)" + assert len(df) == 2, "Should have exactly 2 log entries" + assert set(df["step"].to_list()) == {1}, "Step should be 1 (single increment)" @pytest.mark.asyncio async def test_train_sft_no_metrics_when_empty(self, tmp_path: Path): diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index db083242..6b032c05 100644 --- a/tests/unit/test_metrics_builder.py +++ 
b/tests/unit/test_metrics_builder.py @@ -78,6 +78,23 @@ async def test_helper_metrics_accumulate_within_a_single_step(self) -> None: assert metrics["throughput/step_trainer_idle_s"] == pytest.approx(1.5) assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0) + @pytest.mark.asyncio + async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -> None: + builder = MetricsBuilder(cost_context="train") + + builder.add_metric("time/step_trainer_s", 4.0) + builder.add_metric("data/step_trainer_tokens", 40.0) + builder.add_metric("time/step_actor_s", 2.0) + builder.add_metric("data/step_actor_tokens", 10.0) + builder.add_idle_times(step_trainer_idle_s=1.5, step_actor_idle_s=0.5) + + metrics = await builder.flush(step=1) + + assert metrics["throughput/cum_trainer_idle_s"] == pytest.approx(1.5) + assert metrics["throughput/cum_actor_idle_s"] == pytest.approx(0.5) + assert metrics["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) + assert metrics["throughput/avg_actor_tok_per_s"] == pytest.approx(5.0) + @pytest.mark.asyncio async def test_costs_all_generated_for_single_and_multiple_children(self) -> None: single = MetricsBuilder(cost_context="train") diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index 4cb6dd34..09915d12 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -308,3 +308,4 @@ async def eval_fn( rows = [json.loads(line) for line in f if line.strip()] assert any("costs/eval/llm_judge/correctness" in row for row in rows) + assert any("time/step_eval_s" in row for row in rows) From 02a3c58e897996ba6206c695016202ea11e42b26 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 10:56:10 -0700 Subject: [PATCH 18/46] feat: Emit time and data metrics across training flows --- src/art/local/backend.py | 67 ++++++++++++++----- src/art/metrics.py | 38 +++++++++++ src/art/metrics_taxonomy.py | 100 ++++++++++++++++++++++++++++ src/art/model.py | 58 
+++++++++++++++- src/art/pipeline_trainer/trainer.py | 75 ++++++++++++++++++--- src/art/serverless/backend.py | 34 +++++++++- src/art/tinker_native/backend.py | 13 +++- 7 files changed, 355 insertions(+), 30 deletions(-) diff --git a/src/art/local/backend.py b/src/art/local/backend.py index aecef80a..6ff4a53a 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -6,6 +6,7 @@ import shutil import socket import subprocess +import time from types import TracebackType from typing import AsyncIterator, Iterable, Literal, cast import warnings @@ -39,7 +40,13 @@ from .. import dev from ..backend import AnyTrainableModel, Backend -from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics +from ..metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, + build_data_metrics_from_summary, + build_train_metrics_from_summary, + rename_train_metrics, + summarize_trajectory_groups, +) from ..model import Model, TrainableModel from ..preprocessing.pack import ( PackedTensors, @@ -568,6 +575,7 @@ async def train( # type: ignore[override] # Collect metrics from training training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self._train_model( model, groups_list, config, dev_config, verbose ): @@ -582,6 +590,22 @@ async def train( # type: ignore[override] for k in {k for d in training_metrics for k in d} if k != TRAIN_GRADIENT_STEPS_KEY } + summary = summarize_trajectory_groups(groups_list) + avg_metrics.setdefault( + "time/step_trainer_s", time.monotonic() - trainer_started + ) + avg_metrics.update( + { + key: value + for key, value in { + **build_data_metrics_from_summary( + summary, include_trainable_groups=True + ), + **build_train_metrics_from_summary(summary), + }.items() + if key not in avg_metrics + } + ) # Get step and checkpoint path step = await self._get_step(model) @@ -619,13 +643,11 @@ async def _train_model( if verbose: print("Packing tensors...") - # Count submitted groups and 
trainable groups - num_groups_submitted = len(trajectory_groups) - num_groups_trainable = sum( - 1 - for group in trajectory_groups - if group and len(set(trajectory.reward for trajectory in group)) > 1 - ) + summary = summarize_trajectory_groups(trajectory_groups) + base_metrics = { + **build_data_metrics_from_summary(summary, include_trainable_groups=True), + **build_train_metrics_from_summary(summary), + } packed_tensors = self._get_packed_tensors( model, @@ -687,16 +709,20 @@ async def _train_model( # Yield metrics showing no groups were trainable # (the frontend will handle logging) yield { - "train/num_groups_submitted": float(num_groups_submitted), + **base_metrics, + "data/step_num_groups_trainable": 0.0, "train/num_groups_trainable": 0.0, + "data/step_trainer_tokens": 0.0, TRAIN_GRADIENT_STEPS_KEY: 0.0, } return + base_metrics["data/step_trainer_tokens"] = float( + packed_tensors["assistant_mask"].sum().item() + ) disk_packed_tensors = packed_tensors_to_dir( packed_tensors, f"{get_model_dir(model=model, art_path=self._path)}/tensors" ) # Note: scale_learning_rate_by_reward_std_dev is now handled by the frontend (Model.train()) - results: list[dict[str, float]] = [] estimated_gradient_steps = disk_packed_tensors["num_sequences"] pbar = tqdm.tqdm(total=estimated_gradient_steps, desc="train") async for result in service.train( @@ -709,8 +735,11 @@ async def _train_model( assert num_gradient_steps == estimated_gradient_steps, ( f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}" ) - results.append(result) - yield {**result, TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps)} + yield { + **base_metrics, + **result, + TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps), + } pbar.update(1) pbar.set_postfix(result) pbar.close() @@ -797,15 +826,21 @@ async def _train_sft( service = await self._get_service(model) pbar = tqdm.tqdm(total=len(batches), desc="sft train") - total_trainable_tokens = 0 + total_trainable_tokens 
= sum(batch.num_trainable_tokens for batch in batches) + total_trajectories = len(trajectory_list) batch_count = 0 async for result in service.train_sft(batches, verbose): pbar.update(1) - pbar.set_postfix({"loss": f"{result.get('loss', 0):.4f}"}) - total_trainable_tokens += result.get("num_trainable_tokens", 0) + pbar.set_postfix({"loss": f"{result.get('loss/train', 0):.4f}"}) batch_count += 1 - yield result + yield { + **result, + "data/step_num_trajectories": float(total_trajectories), + "data/step_trainer_tokens": float(total_trainable_tokens), + "train/num_trajectories": float(total_trajectories), + "train/num_trainable_tokens": float(total_trainable_tokens), + } pbar.close() diff --git a/src/art/metrics.py b/src/art/metrics.py index ea299427..f31e0b6f 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from functools import wraps from inspect import iscoroutinefunction +import time from typing import Any, ParamSpec, TypeVar from .costs import tokens_to_cost @@ -14,6 +15,10 @@ _active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder") _HIERARCHICAL_SECTIONS = {"costs", "time", "data"} +_THROUGHPUT_IDLE_MAPPINGS = { + "throughput/step_trainer_idle_s": "throughput/cum_trainer_idle_s", + "throughput/step_actor_idle_s": "throughput/cum_actor_idle_s", +} _DEFAULT_PROVIDER = "openai" _OPENAI_PROVIDER = "openai" _ANTHROPIC_PROVIDER = "anthropic" @@ -210,6 +215,14 @@ def add_idle_times( if step_actor_idle_s is not None: self.add_metric("throughput/step_actor_idle_s", float(step_actor_idle_s)) + @contextmanager + def measure(self, key: str): + started = time.monotonic() + try: + yield + finally: + self.add_metric(key, time.monotonic() - started) + async def flush(self, step: int) -> dict[str, float]: del step async with self._lock: @@ -237,6 +250,7 @@ async def flush(self, step: int) -> dict[str, float]: len(self._unique_scenario_ids) ) + self._update_throughput_metrics(result) 
self._step_buffer.clear() return result @@ -369,6 +383,30 @@ def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]: return rollups + def _update_throughput_metrics(self, result: dict[str, float]) -> None: + for step_key, cum_key in _THROUGHPUT_IDLE_MAPPINGS.items(): + if step_key not in result: + continue + next_value = self._cum_state.get(cum_key, 0.0) + result[step_key] + self._cum_state[cum_key] = next_value + result[cum_key] = next_value + + trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum") + trainer_seconds = self._cum_state.get("time/step_trainer_s_cum") + if ( + trainer_tokens is not None + and trainer_seconds is not None + and trainer_seconds > 0 + ): + result["throughput/avg_trainer_tok_per_s"] = ( + trainer_tokens / trainer_seconds + ) + + actor_tokens = self._cum_state.get("data/step_actor_tokens_cum") + actor_seconds = self._cum_state.get("time/step_actor_s_cum") + if actor_tokens is not None and actor_seconds is not None and actor_seconds > 0: + result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds + def _resolve_token_pricing( self, provider: str | None, diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py index e7d108d4..79a175c3 100644 --- a/src/art/metrics_taxonomy.py +++ b/src/art/metrics_taxonomy.py @@ -1,5 +1,19 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Iterable + +from .trajectories import TrajectoryGroup + TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps" +_SCENARIO_ID_CANDIDATE_KEYS = ( + "scenario_id", + "scenario_scenario_id", + "scenario_idx", + "scenario_scenario_idx", +) + TRAIN_METRIC_KEY_RENAMES = { "reward": "reward/mean", "reward_std_dev": "reward/std_dev", @@ -29,3 +43,89 @@ def rename_train_metric_key(metric: str) -> str: def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]: return {rename_train_metric_key(key): float(value) for key, value in metrics.items()} + + 
+@dataclass(frozen=True) +class TrajectoryBatchSummary: + num_scenarios: int + num_trajectories: int + num_groups_submitted: int + num_groups_trainable: int + scenario_ids: list[str] + + +def summarize_trajectory_groups( + trajectory_groups: Iterable[TrajectoryGroup], +) -> TrajectoryBatchSummary: + groups = list(trajectory_groups) + scenario_ids: list[str] = [] + seen_scenario_ids: set[str] = set() + + for group in groups: + scenario_id = _extract_scenario_id(group) + if scenario_id is None or scenario_id in seen_scenario_ids: + continue + seen_scenario_ids.add(scenario_id) + scenario_ids.append(scenario_id) + + return TrajectoryBatchSummary( + num_scenarios=len(groups), + num_trajectories=sum(len(group.trajectories) + len(group.exceptions) for group in groups), + num_groups_submitted=len(groups), + num_groups_trainable=sum(1 for group in groups if _group_is_trainable(group)), + scenario_ids=scenario_ids, + ) + + +def build_data_metrics_from_summary( + summary: TrajectoryBatchSummary, + *, + include_trainable_groups: bool, +) -> dict[str, float]: + metrics = { + "data/step_num_scenarios": float(summary.num_scenarios), + "data/step_num_trajectories": float(summary.num_trajectories), + "data/step_num_groups_submitted": float(summary.num_groups_submitted), + } + if include_trainable_groups: + metrics["data/step_num_groups_trainable"] = float(summary.num_groups_trainable) + return metrics + + +def build_train_metrics_from_summary( + summary: TrajectoryBatchSummary, +) -> dict[str, float]: + return { + "train/num_groups_submitted": float(summary.num_groups_submitted), + "train/num_groups_trainable": float(summary.num_groups_trainable), + "train/num_trajectories": float(summary.num_trajectories), + } + + +def _group_is_trainable(group: TrajectoryGroup) -> bool: + rewards = [trajectory.reward for trajectory in group.trajectories] + return len(rewards) > 1 and len(set(rewards)) > 1 + + +def _extract_scenario_id(group: TrajectoryGroup) -> str | None: + for metadata in 
[group.metadata, *(trajectory.metadata for trajectory in group.trajectories)]: + scenario_id = _extract_scenario_id_from_metadata(metadata) + if scenario_id is not None: + return scenario_id + return None + + +def _extract_scenario_id_from_metadata( + metadata: dict[str, Any], +) -> str | None: + for key in _SCENARIO_ID_CANDIDATE_KEYS: + value = metadata.get(key) + if value is not None: + return str(value) + + for key, value in metadata.items(): + if value is None: + continue + if key.endswith("scenario_id") or key.endswith("scenario_idx"): + return str(value) + return None diff --git a/src/art/model.py b/src/art/model.py index f23cad63..c52705c5 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -16,7 +16,12 @@ from . import dev from .costs import CostCalculator from .metrics import MetricsBuilder -from .metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY +from .metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, + build_data_metrics_from_summary, + build_train_metrics_from_summary, + summarize_trajectory_groups, +) from .trajectories import Trajectory, TrajectoryGroup from .types import TrainConfig, TrainSFTConfig from .utils.trajectory_logging import write_trajectory_groups_parquet @@ -33,7 +38,7 @@ COSTS_METRIC_PREFIX = "costs_" COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" METRICS_BUILDER_STATE_KEY = "_metrics_builder_state" -BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_") +BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_", "throughput/step_") METRIC_SECTIONS = frozenset( { "reward", @@ -515,6 +520,39 @@ def _extract_non_cost_metrics( non_cost_metrics[metric] = numeric_value return non_cost_metrics + def _add_default_step_metrics( + self, + trajectory_groups: list[TrajectoryGroup], + *, + split: str, + provided_metric_keys: set[str], + ) -> dict[str, float]: + if split not in METRIC_SPLITS: + return {} + + summary = summarize_trajectory_groups(trajectory_groups) + default_data_metrics = build_data_metrics_from_summary( + summary, + 
include_trainable_groups=split == "train", + ) + for key, value in default_data_metrics.items(): + if key in provided_metric_keys: + continue + self._metrics_builder.add_metric(key, value) + + if summary.scenario_ids: + self._metrics_builder.add_data(scenario_ids=summary.scenario_ids) + + if split != "train": + return {} + + default_train_metrics = build_train_metrics_from_summary(summary) + return { + key: value + for key, value in default_train_metrics.items() + if key not in provided_metric_keys + } + def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder: if cost_context is None: return self._metrics_builder @@ -595,6 +633,12 @@ async def log( else: trajectory_groups = cast(list[TrajectoryGroup], list(trajectories)) + default_train_metrics = self._add_default_step_metrics( + trajectory_groups, + split=split, + provided_metric_keys=set(metrics or {}), + ) + # Ensure output directories exist output_dir = self._get_output_dir() trajectories_dir = f"{output_dir}/trajectories/{split}" @@ -663,6 +707,8 @@ async def log( if len(values) > 0: averages[metric] = sum(values) / len(values) + averages.update(default_train_metrics) + # Aggregate group-level metrics once per group for metric, values in group_metrics.items(): if len(values) > 0: @@ -907,6 +953,7 @@ async def train( # 1. Train (backend no longer logs internally) training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self.backend()._train_model( self, groups_list, @@ -915,6 +962,7 @@ async def train( verbose, ): training_metrics.append(metrics) + trainer_elapsed = time.monotonic() - trainer_started # 2. Calculate aggregated training metrics avg_metrics: dict[str, float] = {} @@ -925,6 +973,7 @@ async def train( for k in {k for d in training_metrics for k in d} if k != TRAIN_GRADIENT_STEPS_KEY } + avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed) # 3. 
Log trajectories and training metrics together (single wandb log call) step = await self.get_step() @@ -955,6 +1004,7 @@ async def train_sft( # Collect all metrics and aggregate them at the end (same as RL) _config = _config or {} # ty:ignore[invalid-assignment] training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self.backend()._train_sft( self, trajectories, @@ -963,6 +1013,7 @@ async def train_sft( verbose, ): training_metrics.append(metrics) + trainer_elapsed = time.monotonic() - trainer_started # Log aggregated training metrics once (same as RL) if training_metrics: @@ -971,6 +1022,7 @@ async def train_sft( / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} } + avg_metrics["time/step_trainer_s"] = trainer_elapsed # Get the current step after training step = await self.get_step() - self._log_metrics(avg_metrics, "train", step) + await self.log(trajectories=None, split="train", metrics=avg_metrics, step=step) diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py index 9dcec1cd..a32ad1b8 100644 --- a/src/art/pipeline_trainer/trainer.py +++ b/src/art/pipeline_trainer/trainer.py @@ -16,6 +16,8 @@ from .types import ConfigT, EvalFn, RolloutFn, ScenarioT, SingleRolloutFn # noqa: F401 PIPELINE_STATE_KEY = "_pipeline_trainer" +_ROLLOUT_WALL_TIME_KEY = "_art_rollout_wall_s" +_ACTOR_IDLE_TIME_KEY = "_art_actor_idle_s" def _to_async_iterator(iterable: Iterable[T] | AsyncIterator[T]) -> AsyncIterator[T]: @@ -322,17 +324,21 @@ async def _rollout_worker(self, worker_id: int) -> None: self._status.note_rollout_started() errored = False try: + wait_started = time.monotonic() await self._wait_for_policy() + actor_idle_s = time.monotonic() - wait_started if self.state.done: break initial_version = self.state.policy_version token = self.model.activate_metrics_context("train") + rollout_started = time.monotonic() try: group = await 
self.rollout_fn(self.model, scenario, self.config) finally: token.var.reset(token) + rollout_wall_s = time.monotonic() - rollout_started if not isinstance(group, TrajectoryGroup): errored = True continue @@ -344,7 +350,9 @@ async def _rollout_worker(self, worker_id: int) -> None: ) if self.state.done: break - await self._put_output_group(group) + queue_wait_s = await self._put_output_group(group) + group.metadata[_ROLLOUT_WALL_TIME_KEY] = rollout_wall_s + group.metadata[_ACTOR_IDLE_TIME_KEY] = actor_idle_s + queue_wait_s except asyncio.CancelledError: raise except Exception as exc: @@ -383,13 +391,17 @@ async def _training_stage(self) -> None: if stop_at_step is not None and current_step >= stop_at_step: break step_start = time.monotonic() + collect_started = time.monotonic() batch, discarded, saw_sentinel = await self._collect_batch(current_step) + trainer_idle_s = time.monotonic() - collect_started self.state.discarded_stale_samples += discarded if discarded: self._status.note_stale(discarded) if not batch: break + actor_wall_s, actor_idle_s = self._consume_batch_rollout_timings(batch) + expected_step = current_step + 1 should_eval_step = self._should_eval_step(expected_step) should_checkpoint = self.save_checkpoint and should_eval_step @@ -399,10 +411,9 @@ async def _training_stage(self) -> None: self.state.policy_updated.notify_all() self._status.note_training_start(len(batch)) - train_call_start: float | None = None + train_call_start = time.monotonic() if os.getenv("ART_TRAIN_STEP_LOG"): print(f"[train] step {expected_step} starting (batch={len(batch)})") - train_call_start = time.perf_counter() try: result = await self.backend.train( self.model, @@ -418,8 +429,8 @@ async def _training_stage(self) -> None: self._status.note_training_end() raise finally: - if train_call_start is not None: - train_call_elapsed = time.perf_counter() - train_call_start + train_call_elapsed = time.monotonic() - train_call_start + if os.getenv("ART_TRAIN_STEP_LOG"): print( 
f"[train] step {expected_step} done in " f"{train_call_elapsed:.1f}s" @@ -442,7 +453,14 @@ async def _training_stage(self) -> None: ), "steps_off_policy": steps_off_policy, "num_groups": float(len(batch)), + "time/step_wall_s": step_seconds, + "throughput/step_trainer_idle_s": trainer_idle_s, } + metrics.setdefault("time/step_trainer_s", train_call_elapsed) + if actor_wall_s > 0: + metrics["time/step_actor_s"] = actor_wall_s + if actor_idle_s > 0: + metrics["throughput/step_actor_idle_s"] = actor_idle_s metrics.update(result.metrics) await self.model.log( @@ -565,18 +583,22 @@ async def _run_eval(self, step: int) -> None: assert self.eval_fn is not None self._status.note_val_started(step) reward: float | None = None + eval_elapsed = 0.0 try: token = self.model.activate_metrics_context("eval") + eval_started = time.monotonic() try: result = await self.eval_fn(self.model, step, self.config) finally: token.var.reset(token) + eval_elapsed = time.monotonic() - eval_started splits: dict[str, list[art.Trajectory | art.TrajectoryGroup]] if isinstance(result, dict): splits = result else: splits = {"val": result} + logged_eval_timing = False for split_name, items in splits.items(): groups, trajectories = self._normalize_eval_items(items) if split_name == "val": @@ -585,7 +607,25 @@ async def _run_eval(self, step: int) -> None: else: reward = None if groups: - await self.model.log(groups, split=split_name, step=step) + metrics = ( + {"time/step_eval_s": eval_elapsed} + if not logged_eval_timing + else None + ) + await self.model.log( + groups, + split=split_name, + step=step, + metrics=metrics, + ) + logged_eval_timing = True + if not logged_eval_timing and eval_elapsed > 0: + await self.model.log( + trajectories=None, + split="val", + step=step, + metrics={"time/step_eval_s": eval_elapsed}, + ) except asyncio.CancelledError: raise except Exception as exc: @@ -742,12 +782,31 @@ def _persist_state(self, training_step: int) -> None: def _is_scalar_metadata(value: object) -> 
bool: return value is None or isinstance(value, (str, int, float, bool)) - async def _put_output_group(self, group: TrajectoryGroup) -> None: + async def _put_output_group(self, group: TrajectoryGroup) -> float: assert self._output_queue is not None + queue_wait_started = time.monotonic() while not self.state.done: try: await asyncio.wait_for(self._output_queue.put(group), timeout=1.0) self._status.note_group_enqueued(group) - return + return time.monotonic() - queue_wait_started except asyncio.TimeoutError: continue + return time.monotonic() - queue_wait_started + + def _consume_batch_rollout_timings( + self, batch: list[TrajectoryGroup] + ) -> tuple[float, float]: + rollout_wall_s = 0.0 + actor_idle_s = 0.0 + for group in batch: + rollout_wall_s += self._pop_float_metadata(group, _ROLLOUT_WALL_TIME_KEY) + actor_idle_s += self._pop_float_metadata(group, _ACTOR_IDLE_TIME_KEY) + return rollout_wall_s, actor_idle_s + + @staticmethod + def _pop_float_metadata(group: TrajectoryGroup, key: str) -> float: + value = group.metadata.pop(key, 0.0) + if isinstance(value, (int, float)): + return float(value) + return 0.0 diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index f9ab8c5f..ede695cc 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -1,4 +1,5 @@ import asyncio +import time from typing import TYPE_CHECKING, Any, AsyncIterator, Iterable, Literal import warnings @@ -9,7 +10,13 @@ from .. 
import dev from ..backend import AnyTrainableModel, Backend -from ..metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY, rename_train_metrics +from ..metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, + build_data_metrics_from_summary, + build_train_metrics_from_summary, + rename_train_metrics, + summarize_trajectory_groups, +) from ..trajectories import Trajectory, TrajectoryGroup from ..types import ServerlessTrainResult, TrainConfig, TrainSFTConfig from ..utils.record_provenance import record_provenance @@ -236,6 +243,7 @@ async def train( # type: ignore[override] # Collect metrics from training training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self._train_model( model, groups_list, config, dev_config, verbose ): @@ -250,6 +258,22 @@ async def train( # type: ignore[override] for k in {k for d in training_metrics for k in d} if k != TRAIN_GRADIENT_STEPS_KEY } + summary = summarize_trajectory_groups(groups_list) + avg_metrics.setdefault( + "time/step_trainer_s", time.monotonic() - trainer_started + ) + avg_metrics.update( + { + key: value + for key, value in { + **build_data_metrics_from_summary( + summary, include_trainable_groups=True + ), + **build_train_metrics_from_summary(summary), + }.items() + if key not in avg_metrics + } + ) # Get step and artifact name step = await self._get_step(model) @@ -276,6 +300,11 @@ async def _train_model( dev_config: dev.TrainConfig, verbose: bool = False, ) -> AsyncIterator[dict[str, float]]: + summary = summarize_trajectory_groups(trajectory_groups) + base_metrics = { + **build_data_metrics_from_summary(summary, include_trainable_groups=True), + **build_train_metrics_from_summary(summary), + } assert model.id is not None, "Model ID is required" training_job = await self._client.training_jobs.create( # ty:ignore[possibly-missing-attribute] model_id=model.id, @@ -312,6 +341,7 @@ async def _train_model( {k: float(v) for k, v in event.data.items()} ) yield { + **base_metrics, 
**metrics, TRAIN_GRADIENT_STEPS_KEY: float(num_sequences), } @@ -484,6 +514,8 @@ async def _train_sft( ) yield { **metrics, + "data/step_num_trajectories": float(num_trajectories), + "train/num_trajectories": float(num_trajectories), TRAIN_GRADIENT_STEPS_KEY: float(num_batches), } elif event.type == "training_started": diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index 19df73dd..0c6a1654 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -30,7 +30,12 @@ from .. import dev from ..backend import Backend from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing -from ..metrics_taxonomy import rename_train_metric_key +from ..metrics_taxonomy import ( + build_data_metrics_from_summary, + build_train_metrics_from_summary, + rename_train_metric_key, + summarize_trajectory_groups, +) from ..model import Model, TrainableModel from ..tinker.backend import get_renderer_name from ..tinker.server import get_free_port @@ -209,6 +214,7 @@ async def train( # type: ignore[override] ) -> TrainResult: state = self._model_state[model.name] groups_list = list(trajectory_groups) + summary = summarize_trajectory_groups(groups_list) datums = trajectory_groups_to_datums( groups_list, @@ -218,7 +224,8 @@ async def train( # type: ignore[override] ) metrics: dict[str, float] = { - "train/num_groups_submitted": float(len(groups_list)), + **build_data_metrics_from_summary(summary, include_trainable_groups=True), + **build_train_metrics_from_summary(summary), "data/step_num_datums": float(len(datums)), } @@ -234,6 +241,7 @@ async def train( # type: ignore[override] metrics["costs/train/tinker_train"] = compute_train_cost( train_tokens, pricing ) + trainer_started = time.monotonic() if adam_params is None: adam_params = tinker.AdamParams( @@ -301,6 +309,7 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum: state.current_step = next_step self._persist_model_state(model, state) + 
metrics["time/step_trainer_s"] = time.monotonic() - trainer_started return TrainResult(step=state.current_step, metrics=metrics) From 59de04d2e0cb40016bec1edc6c75e1f27ff98261 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 10:56:13 -0700 Subject: [PATCH 19/46] docs: Document auto-emitted taxonomy metrics --- docs/metrics-taxonomy.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index b5a2294c..300bdf0b 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -47,6 +47,44 @@ ART rolls costs up automatically: - parent rollups (for example `costs/train`, `costs/all`) - cumulative keys with `_cum` suffix (for example `costs/all_cum`) +## Metrics Added By ART + +ART now emits the following metrics from library internals where the data is available: + +- `reward/*` aggregates from `model.log(..., split="train")` +- `loss/*` from trainer backends +- `time/wall_clock_sec` and `training_step` on every logged row +- `time/step_trainer_s` for training calls +- `time/step_wall_s`, `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer` +- `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted` +- `data/step_num_groups_trainable` for train splits +- `data/cum_num_unique_scenarios` when scenario IDs are present in group or trajectory metadata +- `data/step_trainer_tokens` where the backend knows the trainer token count +- `throughput/cum_trainer_idle_s`, `throughput/cum_actor_idle_s` +- `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available + +Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer. 
+ +## User Helpers + +Use the builder helpers for step-level metrics that only user code can know: + +```python +builder = model.metrics_builder() + +with builder.measure("time/step_actor_s"): + result = await run_rollouts() + +builder.add_data( + step_actor_tokens=result.actor_tokens, + scenario_ids=result.scenario_ids, +) + +builder.add_idle_times(step_actor_idle_s=result.actor_idle_s) +``` + +If these metrics are logged before the next `model.log(...)` flush, ART will also emit the cumulative and derived throughput metrics automatically. + ## End-to-End Smoke Test Run: From a9ce32fab14bd7fe02bd7213164694c074e4a519 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:04:04 -0700 Subject: [PATCH 20/46] feat: Add yes-no-maybe metrics example --- dev/yes-no-maybe-metrics.py | 261 ++++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 dev/yes-no-maybe-metrics.py diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py new file mode 100644 index 00000000..32729990 --- /dev/null +++ b/dev/yes-no-maybe-metrics.py @@ -0,0 +1,261 @@ +"""Yes-no-maybe metrics demo for the LocalBackend `model.train()` path. + +This keeps the same prompt family, rollout structure, and reward ordering as +`dev/yes-no-maybe.py` while adding explicit metrics taxonomy instrumentation for +actor/eval timing and data metrics. 
+""" + +from __future__ import annotations + +import asyncio +from itertools import permutations +import os +import time + +from dotenv import load_dotenv +import openai + +try: + import unsloth # noqa: F401 +except ImportError: + pass + +import art +from art.local import LocalBackend + + +async def create_chat_completion( + client: openai.AsyncOpenAI, + *, + model_name: str, + messages: art.Messages, + max_tokens: int, + timeout: float, +) -> openai.types.chat.chat_completion.ChatCompletion: + return await client.chat.completions.create( + messages=messages, + model=model_name, + max_tokens=max_tokens, + timeout=timeout, + ) + + +def with_quotes(word: str) -> str: + return f"'{word}'" + + +def build_prompts() -> list[str]: + return [ + f"{prefix} with {', '.join([with_quotes(word) if use_quotes else word for word in words]) if len(words) == 3 else f'{words[0]}' + (f' or {words[1]}' if len(words) > 1 else '')}" + for prefix in ["respond", "just respond"] + for use_quotes in [True, False] + for words in ( + list(permutation) + for length in [3, 2] + for permutation in permutations(["yes", "no", "maybe"], length) + ) + ] + + +def reward_for_answer(content: str | None) -> float: + if content == "yes": + return 0.5 + if content == "no": + return 0.75 + if content == "maybe": + return 1.0 + return 0.0 + + +def scenario_id_for_prompt(prompt: str) -> str: + return prompt.replace(" ", "_").replace("'", "") + + +def response_total_tokens( + response: openai.types.chat.chat_completion.ChatCompletion, +) -> int: + usage = response.usage + if usage is None: + return 0 + prompt_tokens = int(usage.prompt_tokens or 0) + completion_tokens = int(usage.completion_tokens or 0) + return prompt_tokens + completion_tokens + + +def total_actor_tokens(groups: list[art.TrajectoryGroup]) -> int: + return sum( + int(trajectory.metadata.get("actor_total_tokens", 0) or 0) + for group in groups + for trajectory in group.trajectories + ) + + +async def rollout( + client: openai.AsyncOpenAI, + 
model: art.TrainableModel, + prompt: str, + *, + max_tokens: int, + timeout: float, +) -> art.Trajectory: + messages: art.Messages = [{"role": "user", "content": prompt}] + chat_completion = await create_chat_completion( + client, + model_name=model.get_inference_name(), + messages=messages, + max_tokens=max_tokens, + timeout=timeout, + ) + choice = chat_completion.choices[0] + content = choice.message.content + return art.Trajectory( + messages_and_choices=[*messages, choice], + reward=reward_for_answer(content), + metadata={ + "scenario_id": scenario_id_for_prompt(prompt), + "actor_total_tokens": response_total_tokens(chat_completion), + }, + metrics={ + "valid_answer": reward_for_answer(content) > 0.0, + }, + ) + + +async def evaluate( + client: openai.AsyncOpenAI, + model: art.TrainableModel, + prompts: list[str], + *, + max_tokens: int, + timeout: float, +) -> list[art.TrajectoryGroup]: + groups = await art.gather_trajectory_groups( + art.TrajectoryGroup( + [ + rollout( + client, + model, + prompt, + max_tokens=max_tokens, + timeout=timeout, + ) + ], + metadata={"scenario_id": scenario_id_for_prompt(prompt)}, + ) + for prompt in prompts + ) + return groups + + +def print_history_summary(model: art.TrainableModel) -> None: + history_path = ( + model.base_path + f"/{model.project}/models/{model.name}/history.jsonl" + ) + print(f"History: {history_path}") + + +def build_internal_config() -> art.dev.InternalModelConfig: + return art.dev.InternalModelConfig( + engine_args=art.dev.EngineArgs( + gpu_memory_utilization=float( + os.environ.get("GPU_MEMORY_UTILIZATION", "0.85") + ), + max_model_len=int(os.environ.get("MAX_MODEL_LEN", "4096")), + ) + ) + + +async def main() -> None: + load_dotenv() + + backend = LocalBackend() + base_model = os.environ.get("BASE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507") + project = os.environ.get("PROJECT", "yes-no-maybe-metrics") + model = art.TrainableModel( + name=os.environ.get( + "MODEL_NAME", 
f"yes-no-maybe-metrics-{int(time.time())}" + ), + project=project, + base_model=base_model, + report_metrics=["wandb"], + _internal_config=build_internal_config(), + ) + try: + await model.register(backend) + + prompts = build_prompts() + eval_prompts = prompts[: int(os.environ.get("EVAL_PROMPTS", "12"))] + openai_client = model.openai_client() + max_steps = int(os.environ.get("NUM_STEPS", "20")) + rollouts_per_prompt = int(os.environ.get("ROLLOUTS_PER_PROMPT", "32")) + max_tokens = int(os.environ.get("MAX_TOKENS", "100")) + timeout = float(os.environ.get("TIMEOUT", "100")) + eval_every_n_steps = int(os.environ.get("EVAL_EVERY_N_STEPS", "1")) + learning_rate = float(os.environ.get("LEARNING_RATE", "1e-4")) + + start_step = await model.get_step() + for offset in range(max_steps): + current_step = start_step + offset + + if ( + eval_every_n_steps > 0 + and (current_step - start_step) % eval_every_n_steps == 0 + ): + eval_builder = model.metrics_builder("eval") + with eval_builder.activate_context(): + with eval_builder.measure("time/step_eval_s"): + val_groups = await evaluate( + openai_client, + model, + eval_prompts, + max_tokens=max_tokens, + timeout=timeout, + ) + eval_builder.add_data( + step_actor_tokens=total_actor_tokens(val_groups) + ) + await model.log(val_groups, split="val", step=current_step) + + train_builder = model.metrics_builder("train") + step_started = time.monotonic() + with train_builder.activate_context(): + with train_builder.measure("time/step_actor_s"): + train_groups = await art.gather_trajectory_groups( + ( + art.TrajectoryGroup( + rollout( + openai_client, + model, + prompt, + max_tokens=max_tokens, + timeout=timeout, + ) + for _ in range(rollouts_per_prompt) + ) + for prompt in prompts + ) + ) + train_builder.add_data( + step_actor_tokens=total_actor_tokens(train_groups) + ) + await model.train( + train_groups, + config=art.TrainConfig(learning_rate=learning_rate), + ) + + step = await model.get_step() + await model.log( + 
trajectories=None, + split="train", + step=step, + metrics={"time/step_wall_s": time.monotonic() - step_started}, + ) + print(f"step {step} complete") + + print_history_summary(model) + finally: + await backend.close() + + +if __name__ == "__main__": + asyncio.run(main()) From 68281b5235bd80b56e51dda9a48cb53401fad777 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:26:54 -0700 Subject: [PATCH 21/46] fix: Load MetricsBuilder state before builder access --- src/art/model.py | 1 + tests/unit/test_frontend_logging.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/art/model.py b/src/art/model.py index c52705c5..dcbdc5a6 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -554,6 +554,7 @@ def _add_default_step_metrics( } def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder: + self._load_metrics_builder_state() if cost_context is None: return self._metrics_builder return self._metrics_builder.for_cost_context(cost_context) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 3230af03..5719009c 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -645,6 +645,36 @@ async def test_cost_cumulative_persists_across_model_recreation( assert second["costs/train/prefill_cum"] == pytest.approx(1.0) assert second["costs/all_cum"] == pytest.approx(1.0) + @pytest.mark.asyncio + async def test_metrics_builder_loads_resume_state_before_builder_use( + self, tmp_path: Path + ): + model_1 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + model_1.metrics_builder().add_data(scenario_ids=["scenario-a"]) + await model_1.log(trajectories=None, split="train", step=1, metrics={}) + + model_2 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + model_2.metrics_builder().add_data(scenario_ids=["scenario-b"]) + await model_2.log(trajectories=None, 
split="train", step=2, metrics={}) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["data/cum_num_unique_scenarios"] == pytest.approx(1.0) + assert second["data/cum_num_unique_scenarios"] == pytest.approx(2.0) + @pytest.mark.asyncio async def test_direct_time_and_data_metrics_get_cumulative_variants( self, tmp_path: Path From 7b8d8f9d3acd961cd05e12c77d21308b14c42b9b Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:28:25 -0700 Subject: [PATCH 22/46] fix: Preserve gradient step metrics in train outputs --- src/art/local/backend.py | 2 +- src/art/model.py | 1 - src/art/serverless/backend.py | 1 - tests/unit/test_frontend_logging.py | 91 +++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 3 deletions(-) diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 6ff4a53a..bdf44179 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -588,7 +588,6 @@ async def train( # type: ignore[override] k: sum(d.get(k, 0) for d in training_metrics) / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} - if k != TRAIN_GRADIENT_STEPS_KEY } summary = summarize_trajectory_groups(groups_list) avg_metrics.setdefault( @@ -838,6 +837,7 @@ async def _train_sft( **result, "data/step_num_trajectories": float(total_trajectories), "data/step_trainer_tokens": float(total_trainable_tokens), + TRAIN_GRADIENT_STEPS_KEY: float(len(batches)), "train/num_trajectories": float(total_trajectories), "train/num_trainable_tokens": float(total_trainable_tokens), } diff --git a/src/art/model.py b/src/art/model.py index dcbdc5a6..4d6b4e32 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -972,7 +972,6 @@ async def train( k: sum(d.get(k, 0) for d in training_metrics) / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} - if k != 
TRAIN_GRADIENT_STEPS_KEY } avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed) diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index ede695cc..5dd7cb0f 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -256,7 +256,6 @@ async def train( # type: ignore[override] k: sum(d.get(k, 0) for d in training_metrics) / sum(1 for d in training_metrics if k in d) for k in {k for d in training_metrics for k in d} - if k != TRAIN_GRADIENT_STEPS_KEY } summary = summarize_trajectory_groups(groups_list) avg_metrics.setdefault( diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 5719009c..b8d38852 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -17,6 +17,8 @@ import pytest from art import Model, TrainableModel, Trajectory, TrajectoryGroup +from art.local.backend import LocalBackend +from art.metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY from art.utils.trajectory_logging import read_trajectory_groups_parquet @@ -965,3 +967,92 @@ async def mock_train_sft(*args, **kwargs): assert not history_path.exists(), ( "No history.jsonl should be created for empty training" ) + + +class TestGradientStepMetrics: + @pytest.mark.asyncio + async def test_model_train_logs_gradient_step_count(self, tmp_path: Path): + model = TrainableModel( + name="test-train", + project="test-project", + base_model="gpt-4", + base_path=str(tmp_path), + report_metrics=[], + ) + + async def mock_train_model(*args, **kwargs): + for loss in (1.0, 0.8, 0.6): + yield { + "loss/train": loss, + TRAIN_GRADIENT_STEPS_KEY: 3.0, + } + + mock_backend = MagicMock() + mock_backend._train_model = mock_train_model + mock_backend._get_step = AsyncMock(return_value=1) + model._backend = mock_backend + + groups = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=1.0, + messages_and_choices=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, 
+ ], + ) + ] + ) + ] + + await model.train(groups) + + history_path = tmp_path / "test-project/models/test-train/history.jsonl" + rows = [json.loads(line) for line in history_path.open() if line.strip()] + merged: dict[str, float] = {} + for row in rows: + merged.update(row) + + assert merged[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_local_backend_train_returns_gradient_step_count( + self, tmp_path: Path + ): + model = TrainableModel( + name="test-backend-train", + project="test-project", + base_model="gpt-4", + base_path=str(tmp_path), + report_metrics=[], + ) + backend = LocalBackend(path=str(tmp_path)) + + async def mock_train_model(*args, **kwargs): + for loss in (1.0, 0.8): + yield { + "loss/train": loss, + TRAIN_GRADIENT_STEPS_KEY: 2.0, + } + + backend._train_model = mock_train_model # type: ignore[method-assign] + backend._get_step = AsyncMock(return_value=1) # type: ignore[method-assign] + + groups = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=1.0, + messages_and_choices=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ], + ) + ] + ) + ] + + result = await backend.train(model, groups, save_checkpoint=False) + + assert result.metrics[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(2.0) From 004d6106780dc424416461ce3467e3a18bfc61b6 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:30:02 -0700 Subject: [PATCH 23/46] fix: Skip stale MetricsBuilder flush outputs --- src/art/metrics.py | 46 ++++++++++++++++++++--------- tests/unit/test_frontend_logging.py | 38 ++++++++++++++++++++++++ tests/unit/test_metrics_builder.py | 14 +++++++++ 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/art/metrics.py b/src/art/metrics.py index f31e0b6f..b65157a0 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -51,6 +51,7 @@ class _SharedMetricsState: step_buffer: dict[str, float] cum_state: dict[str, float] unique_scenario_ids: set[str] + 
pending_scenario_ids: set[str] cost_extractors: dict[str, CostExtractor] token_pricing: dict[str, TokenPricing] @@ -61,6 +62,7 @@ def _new_shared_metrics_state() -> _SharedMetricsState: step_buffer={}, cum_state={}, unique_scenario_ids=set(), + pending_scenario_ids=set(), cost_extractors={}, token_pricing=dict(_DEFAULT_TOKEN_PRICING), ) @@ -162,6 +164,7 @@ def __init__( self._step_buffer = self._shared_state.step_buffer self._cum_state = self._shared_state.cum_state self._unique_scenario_ids = self._shared_state.unique_scenario_ids + self._pending_scenario_ids = self._shared_state.pending_scenario_ids self._cost_extractors = self._shared_state.cost_extractors self._token_pricing = self._shared_state.token_pricing @@ -187,7 +190,9 @@ def add_data( if step_actor_tokens is not None: self.add_metric("data/step_actor_tokens", float(step_actor_tokens)) if scenario_ids is not None: - self._unique_scenario_ids.update(scenario_ids) + self._pending_scenario_ids.update( + str(scenario_id) for scenario_id in scenario_ids + ) def add_user_timing( self, @@ -245,13 +250,15 @@ async def flush(self, step: int) -> dict[str, float]: self._cum_state[cum_key] = next_value result[cum_key] = next_value - if self._unique_scenario_ids: + if self._pending_scenario_ids: + self._unique_scenario_ids.update(self._pending_scenario_ids) result["data/cum_num_unique_scenarios"] = float( len(self._unique_scenario_ids) ) self._update_throughput_metrics(result) self._step_buffer.clear() + self._pending_scenario_ids.clear() return result def activate(self) -> Token["MetricsBuilder"]: @@ -319,11 +326,13 @@ def load_state_dict(self, state: dict[str, Any]) -> None: self._shared_state.cum_state.update(restored_cum_state) self._shared_state.unique_scenario_ids.clear() self._shared_state.unique_scenario_ids.update(restored_unique_ids) + self._shared_state.pending_scenario_ids.clear() # Keep local references aligned with the shared state so derived builders # created before or after resume observe the same 
cumulative state. self._cum_state = self._shared_state.cum_state self._unique_scenario_ids = self._shared_state.unique_scenario_ids + self._pending_scenario_ids = self._shared_state.pending_scenario_ids def _validate_and_add(self, key: str, value: float) -> None: if key.endswith("_cum"): @@ -391,21 +400,30 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: self._cum_state[cum_key] = next_value result[cum_key] = next_value - trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum") - trainer_seconds = self._cum_state.get("time/step_trainer_s_cum") if ( - trainer_tokens is not None - and trainer_seconds is not None - and trainer_seconds > 0 + "data/step_trainer_tokens" in result + or "time/step_trainer_s" in result ): - result["throughput/avg_trainer_tok_per_s"] = ( - trainer_tokens / trainer_seconds - ) + trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum") + trainer_seconds = self._cum_state.get("time/step_trainer_s_cum") + if ( + trainer_tokens is not None + and trainer_seconds is not None + and trainer_seconds > 0 + ): + result["throughput/avg_trainer_tok_per_s"] = ( + trainer_tokens / trainer_seconds + ) - actor_tokens = self._cum_state.get("data/step_actor_tokens_cum") - actor_seconds = self._cum_state.get("time/step_actor_s_cum") - if actor_tokens is not None and actor_seconds is not None and actor_seconds > 0: - result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds + if "data/step_actor_tokens" in result or "time/step_actor_s" in result: + actor_tokens = self._cum_state.get("data/step_actor_tokens_cum") + actor_seconds = self._cum_state.get("time/step_actor_s_cum") + if ( + actor_tokens is not None + and actor_seconds is not None + and actor_seconds > 0 + ): + result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds def _resolve_token_pricing( self, diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index b8d38852..6d515b92 100644 --- 
a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -707,6 +707,44 @@ async def test_direct_time_and_data_metrics_get_cumulative_variants( assert entry["data/step_actor_tokens"] == pytest.approx(10) assert entry["data/step_actor_tokens_cum"] == pytest.approx(10) + @pytest.mark.asyncio + async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + model.metrics_builder().add_data(scenario_ids=["scenario-a"]) + await model.log( + trajectories=None, + split="train", + step=1, + metrics={ + "time/step_trainer_s": 2.0, + "data/step_trainer_tokens": 20.0, + }, + ) + await model.log( + trajectories=None, + split="train", + step=2, + metrics={"loss/train": 1.0}, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + rows = [json.loads(line) for line in history_path.open() if line.strip()] + + assert len(rows) == 2 + assert rows[0]["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) + assert rows[0]["data/cum_num_unique_scenarios"] == pytest.approx(1.0) + assert rows[1]["loss/train"] == pytest.approx(1.0) + assert "throughput/avg_trainer_tok_per_s" not in rows[1] + assert "data/cum_num_unique_scenarios" not in rows[1] + class TestWandbIntegration: """Test wandb integration logic (without mocking wandb itself).""" diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 6b032c05..1746c8f2 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -193,6 +193,20 @@ async def test_unique_scenario_count_tracks_exact_ids(self) -> None: second = await builder.flush(step=2) assert second["data/cum_num_unique_scenarios"] == 4 + @pytest.mark.asyncio + async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_metric("time/step_trainer_s", 
2.0) + builder.add_metric("data/step_trainer_tokens", 20.0) + builder.add_data(scenario_ids=["s1"]) + + first = await builder.flush(step=1) + assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) + assert first["data/cum_num_unique_scenarios"] == 1 + + second = await builder.flush(step=2) + assert second == {} + @pytest.mark.asyncio async def test_concurrent_add_cost_calls_do_not_lose_updates(self) -> None: builder = MetricsBuilder(cost_context="train") From b9733666df6514e8228ddf5812dbbf36b6b8d6a0 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:48:36 -0700 Subject: [PATCH 24/46] fix: Normalize Model.log inputs once --- src/art/model.py | 43 ++++++++++++--------- tests/unit/test_frontend_logging.py | 59 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/src/art/model.py b/src/art/model.py index 4d6b4e32..40818465 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -578,6 +578,25 @@ def _persist_metrics_builder_state(self) -> None: {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()} ) + def _normalize_trajectory_groups( + self, + trajectories: Iterable[Trajectory | BaseException] | Iterable[TrajectoryGroup], + ) -> list[TrajectoryGroup]: + items = list(trajectories) + if not items: + return [] + + if all(isinstance(item, TrajectoryGroup) for item in items): + return cast(list[TrajectoryGroup], items) + + if all(isinstance(item, (Trajectory, BaseException)) for item in items): + return [TrajectoryGroup(cast(Iterable[Trajectory | BaseException], items))] + + raise TypeError( + "trajectories must be an iterable of TrajectoryGroup objects or " + "an iterable of Trajectory/BaseException items" + ) + async def log( self, trajectories: ( @@ -622,17 +641,7 @@ async def log( self._persist_metrics_builder_state() return - # Convert to list[TrajectoryGroup] - if any(isinstance(t, Trajectory) for t in trajectories) or any( - isinstance(t, BaseException) for t in trajectories - ): - 
trajectory_groups = [ - TrajectoryGroup( - cast(Iterable[Trajectory | BaseException], trajectories) - ) - ] - else: - trajectory_groups = cast(list[TrajectoryGroup], list(trajectories)) + trajectory_groups = self._normalize_trajectory_groups(trajectories) default_train_metrics = self._add_default_step_metrics( trajectory_groups, @@ -676,13 +685,11 @@ async def log( if metric not in group_metrics: group_metrics[metric] = [] group_metrics[metric].append(float(value)) - for trajectory in group: - if isinstance(trajectory, BaseException): - all_metrics[exception_rate_key].append(1) - continue - else: - all_metrics[exception_rate_key].append(0) - # Add reward metric + + all_metrics[exception_rate_key].extend(0.0 for _ in group.trajectories) + all_metrics[exception_rate_key].extend(1.0 for _ in group.exceptions) + + for trajectory in group.trajectories: all_metrics[reward_key].append(trajectory.reward) # Collect other custom metrics diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 6d515b92..7f689c42 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -470,6 +470,65 @@ async def test_exception_rate_calculation(self, tmp_path: Path): # All successful trajectories = 0% exception rate assert entry["val/exception_rate"] == 0.0 + @pytest.mark.asyncio + async def test_exception_rate_counts_group_exceptions(self, tmp_path: Path): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + trajectory_groups = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.5, + messages_and_choices=[{"role": "user", "content": "test"}], + ) + ], + exceptions=[ValueError("boom")], + ) + ] + + await model.log(trajectory_groups, split="val") + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["val/exception_rate"] == pytest.approx(0.5) + + @pytest.mark.asyncio + 
async def test_generator_of_trajectories_is_consumed_once(self, tmp_path: Path): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + def trajectories(): + yield Trajectory( + reward=1.0, + metrics={"custom": 1.0}, + messages_and_choices=[{"role": "user", "content": "first"}], + ) + yield Trajectory( + reward=3.0, + metrics={"custom": 3.0}, + messages_and_choices=[{"role": "user", "content": "second"}], + ) + + await model.log(trajectories(), split="val") + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["val/reward"] == pytest.approx(2.0) + assert entry["val/custom"] == pytest.approx(2.0) + @pytest.mark.asyncio async def test_train_trajectory_metrics_default_to_reward_prefix( self, tmp_path: Path From 18417e3b74685d29d6716dbe24b20b457d97d8c6 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:49:41 -0700 Subject: [PATCH 25/46] refactor: Share training metric aggregation helpers --- src/art/local/backend.py | 30 ++++++++-------------- src/art/metrics_taxonomy.py | 26 +++++++++++++++++++ src/art/model.py | 15 +++-------- src/art/serverless/backend.py | 30 ++++++++-------------- src/art/tinker_native/backend.py | 9 ++++--- tests/unit/test_metrics_taxonomy.py | 40 +++++++++++++++++++++++++++++ 6 files changed, 96 insertions(+), 54 deletions(-) create mode 100644 tests/unit/test_metrics_taxonomy.py diff --git a/src/art/local/backend.py b/src/art/local/backend.py index bdf44179..1d839cf7 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -42,8 +42,8 @@ from ..backend import AnyTrainableModel, Backend from ..metrics_taxonomy import ( TRAIN_GRADIENT_STEPS_KEY, - build_data_metrics_from_summary, - build_train_metrics_from_summary, + average_metric_samples, + build_training_summary_metrics, rename_train_metrics, summarize_trajectory_groups, ) @@ -582,13 +582,7 @@ async def train( # type: 
ignore[override] training_metrics.append(metrics) # Aggregate metrics - avg_metrics: dict[str, float] = {} - if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - } + avg_metrics = average_metric_samples(training_metrics) summary = summarize_trajectory_groups(groups_list) avg_metrics.setdefault( "time/step_trainer_s", time.monotonic() - trainer_started @@ -596,12 +590,10 @@ async def train( # type: ignore[override] avg_metrics.update( { key: value - for key, value in { - **build_data_metrics_from_summary( - summary, include_trainable_groups=True - ), - **build_train_metrics_from_summary(summary), - }.items() + for key, value in build_training_summary_metrics( + summary, + include_trainable_groups=True, + ).items() if key not in avg_metrics } ) @@ -643,10 +635,10 @@ async def _train_model( print("Packing tensors...") summary = summarize_trajectory_groups(trajectory_groups) - base_metrics = { - **build_data_metrics_from_summary(summary, include_trainable_groups=True), - **build_train_metrics_from_summary(summary), - } + base_metrics = build_training_summary_metrics( + summary, + include_trainable_groups=True, + ) packed_tensors = self._get_packed_tensors( model, diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py index 79a175c3..061e0872 100644 --- a/src/art/metrics_taxonomy.py +++ b/src/art/metrics_taxonomy.py @@ -45,6 +45,18 @@ def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]: return {rename_train_metric_key(key): float(value) for key, value in metrics.items()} +def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]: + totals: dict[str, float] = {} + counts: dict[str, int] = {} + + for sample in metric_samples: + for key, value in sample.items(): + totals[key] = totals.get(key, 0.0) + float(value) + counts[key] = counts.get(key, 0) + 1 + + return {key: 
totals[key] / counts[key] for key in totals} + + @dataclass(frozen=True) class TrajectoryBatchSummary: num_scenarios: int @@ -102,6 +114,20 @@ def build_train_metrics_from_summary( } +def build_training_summary_metrics( + summary: TrajectoryBatchSummary, + *, + include_trainable_groups: bool, +) -> dict[str, float]: + return { + **build_data_metrics_from_summary( + summary, + include_trainable_groups=include_trainable_groups, + ), + **build_train_metrics_from_summary(summary), + } + + def _group_is_trainable(group: TrajectoryGroup) -> bool: rewards = [trajectory.reward for trajectory in group.trajectories] return len(rewards) > 1 and len(set(rewards)) > 1 diff --git a/src/art/model.py b/src/art/model.py index 40818465..625408d1 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -18,6 +18,7 @@ from .metrics import MetricsBuilder from .metrics_taxonomy import ( TRAIN_GRADIENT_STEPS_KEY, + average_metric_samples, build_data_metrics_from_summary, build_train_metrics_from_summary, summarize_trajectory_groups, @@ -973,13 +974,7 @@ async def train( trainer_elapsed = time.monotonic() - trainer_started # 2. Calculate aggregated training metrics - avg_metrics: dict[str, float] = {} - if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - } + avg_metrics = average_metric_samples(training_metrics) avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed) # 3. 
Log trajectories and training metrics together (single wandb log call) @@ -1024,11 +1019,7 @@ async def train_sft( # Log aggregated training metrics once (same as RL) if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - } + avg_metrics = average_metric_samples(training_metrics) avg_metrics["time/step_trainer_s"] = trainer_elapsed # Get the current step after training step = await self.get_step() diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index 5dd7cb0f..d0589f7f 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -12,8 +12,8 @@ from ..backend import AnyTrainableModel, Backend from ..metrics_taxonomy import ( TRAIN_GRADIENT_STEPS_KEY, - build_data_metrics_from_summary, - build_train_metrics_from_summary, + average_metric_samples, + build_training_summary_metrics, rename_train_metrics, summarize_trajectory_groups, ) @@ -250,13 +250,7 @@ async def train( # type: ignore[override] training_metrics.append(metrics) # Aggregate metrics - avg_metrics: dict[str, float] = {} - if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - } + avg_metrics = average_metric_samples(training_metrics) summary = summarize_trajectory_groups(groups_list) avg_metrics.setdefault( "time/step_trainer_s", time.monotonic() - trainer_started @@ -264,12 +258,10 @@ async def train( # type: ignore[override] avg_metrics.update( { key: value - for key, value in { - **build_data_metrics_from_summary( - summary, include_trainable_groups=True - ), - **build_train_metrics_from_summary(summary), - }.items() + for key, value in build_training_summary_metrics( + summary, + include_trainable_groups=True, + ).items() if key not in avg_metrics } ) @@ -300,10 +292,10 @@ async def 
_train_model( verbose: bool = False, ) -> AsyncIterator[dict[str, float]]: summary = summarize_trajectory_groups(trajectory_groups) - base_metrics = { - **build_data_metrics_from_summary(summary, include_trainable_groups=True), - **build_train_metrics_from_summary(summary), - } + base_metrics = build_training_summary_metrics( + summary, + include_trainable_groups=True, + ) assert model.id is not None, "Model ID is required" training_job = await self._client.training_jobs.create( # ty:ignore[possibly-missing-attribute] model_id=model.id, diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index 0c6a1654..aeb41e1c 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -31,8 +31,7 @@ from ..backend import Backend from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing from ..metrics_taxonomy import ( - build_data_metrics_from_summary, - build_train_metrics_from_summary, + build_training_summary_metrics, rename_train_metric_key, summarize_trajectory_groups, ) @@ -224,8 +223,10 @@ async def train( # type: ignore[override] ) metrics: dict[str, float] = { - **build_data_metrics_from_summary(summary, include_trainable_groups=True), - **build_train_metrics_from_summary(summary), + **build_training_summary_metrics( + summary, + include_trainable_groups=True, + ), "data/step_num_datums": float(len(datums)), } diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py new file mode 100644 index 00000000..45085476 --- /dev/null +++ b/tests/unit/test_metrics_taxonomy.py @@ -0,0 +1,40 @@ +import pytest + +from art.metrics_taxonomy import ( + TrajectoryBatchSummary, + average_metric_samples, + build_training_summary_metrics, +) + + +def test_average_metric_samples_handles_sparse_keys() -> None: + averaged = average_metric_samples( + [ + {"loss/train": 1.0, "loss/grad_norm": 0.5}, + {"loss/train": 0.5}, + {"loss/grad_norm": 1.0}, + ] + ) + + assert 
averaged["loss/train"] == pytest.approx(0.75) + assert averaged["loss/grad_norm"] == pytest.approx(0.75) + + +def test_build_training_summary_metrics_includes_data_and_train_sections() -> None: + summary = TrajectoryBatchSummary( + num_scenarios=2, + num_trajectories=5, + num_groups_submitted=2, + num_groups_trainable=1, + scenario_ids=["a", "b"], + ) + + metrics = build_training_summary_metrics( + summary, + include_trainable_groups=True, + ) + + assert metrics["data/step_num_scenarios"] == pytest.approx(2.0) + assert metrics["data/step_num_groups_trainable"] == pytest.approx(1.0) + assert metrics["train/num_groups_submitted"] == pytest.approx(2.0) + assert metrics["train/num_trajectories"] == pytest.approx(5.0) From ffe815e2a5d94937b84e4b82aeabfed18b0cc473 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:52:22 -0700 Subject: [PATCH 26/46] refactor: Simplify MetricsBuilder state access --- src/art/metrics.py | 84 +++++++++++++++--------------- src/art/model.py | 4 +- tests/unit/test_metrics_builder.py | 40 +++++++------- tests/unit/test_track_api_cost.py | 8 +-- 4 files changed, 68 insertions(+), 68 deletions(-) diff --git a/src/art/metrics.py b/src/art/metrics.py index b65157a0..809d5061 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -160,13 +160,6 @@ def __init__( self._shared_state = ( _shared_state if _shared_state is not None else _new_shared_metrics_state() ) - self._lock = self._shared_state.lock - self._step_buffer = self._shared_state.step_buffer - self._cum_state = self._shared_state.cum_state - self._unique_scenario_ids = self._shared_state.unique_scenario_ids - self._pending_scenario_ids = self._shared_state.pending_scenario_ids - self._cost_extractors = self._shared_state.cost_extractors - self._token_pricing = self._shared_state.token_pricing def add_cost(self, path: str, usd: float) -> None: if not path: @@ -190,7 +183,7 @@ def add_data( if step_actor_tokens is not None: self.add_metric("data/step_actor_tokens", 
float(step_actor_tokens)) if scenario_ids is not None: - self._pending_scenario_ids.update( + self._shared_state.pending_scenario_ids.update( str(scenario_id) for scenario_id in scenario_ids ) @@ -228,15 +221,14 @@ def measure(self, key: str): finally: self.add_metric(key, time.monotonic() - started) - async def flush(self, step: int) -> dict[str, float]: - del step - async with self._lock: + async def flush(self) -> dict[str, float]: + async with self._shared_state.lock: self._validate_hierarchy() - result = dict(self._step_buffer) + result = dict(self._shared_state.step_buffer) cost_metrics = { key: value - for key, value in self._step_buffer.items() + for key, value in self._shared_state.step_buffer.items() if key.startswith("costs/") } result.update(self._compute_rollups(cost_metrics)) @@ -246,19 +238,21 @@ async def flush(self, step: int) -> dict[str, float]: if section not in _HIERARCHICAL_SECTIONS: continue cum_key = f"{key}_cum" - next_value = self._cum_state.get(cum_key, 0.0) + value - self._cum_state[cum_key] = next_value + next_value = self._shared_state.cum_state.get(cum_key, 0.0) + value + self._shared_state.cum_state[cum_key] = next_value result[cum_key] = next_value - if self._pending_scenario_ids: - self._unique_scenario_ids.update(self._pending_scenario_ids) + if self._shared_state.pending_scenario_ids: + self._shared_state.unique_scenario_ids.update( + self._shared_state.pending_scenario_ids + ) result["data/cum_num_unique_scenarios"] = float( - len(self._unique_scenario_ids) + len(self._shared_state.unique_scenario_ids) ) self._update_throughput_metrics(result) - self._step_buffer.clear() - self._pending_scenario_ids.clear() + self._shared_state.step_buffer.clear() + self._shared_state.pending_scenario_ids.clear() return result def activate(self) -> Token["MetricsBuilder"]: @@ -293,7 +287,7 @@ def register_cost_extractor( normalized_provider = _normalize_provider(provider) if normalized_provider is None: raise ValueError("provider must be 
non-empty") - self._cost_extractors[normalized_provider] = extractor + self._shared_state.cost_extractors[normalized_provider] = extractor def register_token_pricing( self, @@ -305,15 +299,15 @@ def register_token_pricing( normalized_provider = _normalize_provider(provider) if normalized_provider is None: raise ValueError("provider must be non-empty") - self._token_pricing[normalized_provider] = TokenPricing( + self._shared_state.token_pricing[normalized_provider] = TokenPricing( prompt_per_million=float(prompt_per_million), completion_per_million=float(completion_per_million), ) def state_dict(self) -> dict[str, Any]: return { - "cum_state": dict(self._cum_state), - "unique_scenario_ids": list(self._unique_scenario_ids), + "cum_state": dict(self._shared_state.cum_state), + "unique_scenario_ids": list(self._shared_state.unique_scenario_ids), } def load_state_dict(self, state: dict[str, Any]) -> None: @@ -328,19 +322,13 @@ def load_state_dict(self, state: dict[str, Any]) -> None: self._shared_state.unique_scenario_ids.update(restored_unique_ids) self._shared_state.pending_scenario_ids.clear() - # Keep local references aligned with the shared state so derived builders - # created before or after resume observe the same cumulative state. - self._cum_state = self._shared_state.cum_state - self._unique_scenario_ids = self._shared_state.unique_scenario_ids - self._pending_scenario_ids = self._shared_state.pending_scenario_ids - def _validate_and_add(self, key: str, value: float) -> None: if key.endswith("_cum"): raise ValueError( f"Metric key '{key}' ends with '_cum', which is reserved for cumulative metrics." ) - for existing_key in self._step_buffer: + for existing_key in self._shared_state.step_buffer: if existing_key == key: continue if existing_key.startswith(f"{key}/"): @@ -352,10 +340,14 @@ def _validate_and_add(self, key: str, value: float) -> None: f"Cannot log '{key}' as a leaf: '{existing_key}' is already a leaf ancestor." 
) - self._step_buffer[key] = self._step_buffer.get(key, 0.0) + value + self._shared_state.step_buffer[key] = ( + self._shared_state.step_buffer.get(key, 0.0) + value + ) def _validate_hierarchy(self) -> None: - keys = sorted(k for k in self._step_buffer if k.startswith("costs/")) + keys = sorted( + k for k in self._shared_state.step_buffer if k.startswith("costs/") + ) for i, key in enumerate(keys): for other in keys[i + 1 :]: if other.startswith(f"{key}/"): @@ -396,16 +388,22 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: for step_key, cum_key in _THROUGHPUT_IDLE_MAPPINGS.items(): if step_key not in result: continue - next_value = self._cum_state.get(cum_key, 0.0) + result[step_key] - self._cum_state[cum_key] = next_value + next_value = ( + self._shared_state.cum_state.get(cum_key, 0.0) + result[step_key] + ) + self._shared_state.cum_state[cum_key] = next_value result[cum_key] = next_value if ( "data/step_trainer_tokens" in result or "time/step_trainer_s" in result ): - trainer_tokens = self._cum_state.get("data/step_trainer_tokens_cum") - trainer_seconds = self._cum_state.get("time/step_trainer_s_cum") + trainer_tokens = self._shared_state.cum_state.get( + "data/step_trainer_tokens_cum" + ) + trainer_seconds = self._shared_state.cum_state.get( + "time/step_trainer_s_cum" + ) if ( trainer_tokens is not None and trainer_seconds is not None @@ -416,8 +414,10 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: ) if "data/step_actor_tokens" in result or "time/step_actor_s" in result: - actor_tokens = self._cum_state.get("data/step_actor_tokens_cum") - actor_seconds = self._cum_state.get("time/step_actor_s_cum") + actor_tokens = self._shared_state.cum_state.get( + "data/step_actor_tokens_cum" + ) + actor_seconds = self._shared_state.cum_state.get("time/step_actor_s_cum") if ( actor_tokens is not None and actor_seconds is not None @@ -433,9 +433,9 @@ def _resolve_token_pricing( completion_price_per_million: float | None 
= None, ) -> TokenPricing: normalized_provider = _normalize_provider(provider) or _DEFAULT_PROVIDER - default_pricing = self._token_pricing.get( + default_pricing = self._shared_state.token_pricing.get( normalized_provider, - self._token_pricing[_DEFAULT_PROVIDER], + self._shared_state.token_pricing[_DEFAULT_PROVIDER], ) return TokenPricing( prompt_per_million=( @@ -460,7 +460,7 @@ def _extract_api_cost( ) -> float | None: provider_name = _normalize_provider(provider) or _detect_provider(response) if provider_name is not None: - custom_extractor = self._cost_extractors.get(provider_name) + custom_extractor = self._shared_state.cost_extractors.get(provider_name) if custom_extractor is not None: custom_cost = custom_extractor(response) if custom_cost is not None: diff --git a/src/art/model.py b/src/art/model.py index 625408d1..ff0d75e3 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -636,7 +636,7 @@ async def log( metrics_without_costs = self._extract_non_cost_metrics(metrics, split) if metrics_without_costs: self._log_metrics(metrics_without_costs, split, step) - costs = await self._metrics_builder.flush(step) + costs = await self._metrics_builder.flush() if costs: self._log_metrics(costs, split, step) self._persist_metrics_builder_state() @@ -742,7 +742,7 @@ async def log( self._log_metrics(averages, split, step) # 4. 
Log cumulative costs - costs = await self._metrics_builder.flush(step) + costs = await self._metrics_builder.flush() if costs: self._log_metrics(costs, split, step) self._persist_metrics_builder_state() diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 1746c8f2..1ef7cbae 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -15,7 +15,7 @@ async def test_rollup_correctness_across_depths(self) -> None: builder.add_cost("train/tinker_inference", usd=0.45) builder.add_cost("eval/llm_judge/correctness", usd=0.06) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["costs/train/llm_judge"] == pytest.approx(0.12) assert metrics["costs/train"] == pytest.approx(1.77) @@ -35,7 +35,7 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None: step_actor_tokens=10, scenario_ids=["a", "b"], ) - first = await builder.flush(step=1) + first = await builder.flush() assert first["time/step_wall_s_cum"] == pytest.approx(1.5) assert first["time/step_actor_s_cum"] == pytest.approx(0.3) @@ -49,7 +49,7 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None: step_actor_tokens=5, scenario_ids=["b", "c"], ) - second = await builder.flush(step=2) + second = await builder.flush() assert second["time/step_wall_s_cum"] == pytest.approx(2.0) assert second["time/step_actor_s_cum"] == pytest.approx(0.5) @@ -68,7 +68,7 @@ async def test_helper_metrics_accumulate_within_a_single_step(self) -> None: builder.add_idle_times(step_trainer_idle_s=1.0, step_actor_idle_s=2.0) builder.add_idle_times(step_trainer_idle_s=0.5, step_actor_idle_s=1.0) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["data/step_num_scenarios"] == pytest.approx(5) assert metrics["data/step_actor_tokens"] == pytest.approx(15) @@ -88,7 +88,7 @@ async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) - 
builder.add_metric("data/step_actor_tokens", 10.0) builder.add_idle_times(step_trainer_idle_s=1.5, step_actor_idle_s=0.5) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["throughput/cum_trainer_idle_s"] == pytest.approx(1.5) assert metrics["throughput/cum_actor_idle_s"] == pytest.approx(0.5) @@ -99,13 +99,13 @@ async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) - async def test_costs_all_generated_for_single_and_multiple_children(self) -> None: single = MetricsBuilder(cost_context="train") single.add_cost("train/gpu", usd=2.0) - one = await single.flush(step=1) + one = await single.flush() assert one["costs/all"] == pytest.approx(2.0) multi = MetricsBuilder(cost_context="train") multi.add_cost("train/gpu", usd=2.0) multi.add_cost("eval/llm_judge/correctness", usd=0.5) - two = await multi.flush(step=1) + two = await multi.flush() assert two["costs/all"] == pytest.approx(2.5) def test_leaf_parent_conflicts_raise(self) -> None: @@ -125,7 +125,7 @@ async def test_duplicate_leaf_writes_are_summed(self) -> None: builder.add_cost("train/gpu", usd=1.25) builder.add_cost("train/gpu", usd=0.75) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["costs/train/gpu"] == pytest.approx(2.0) assert metrics["costs/train"] == pytest.approx(2.0) @@ -140,14 +140,14 @@ def test_cum_suffix_is_reserved(self) -> None: async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None: builder = MetricsBuilder(cost_context="train") builder.add_cost("train/gpu", usd=1.0) - first = await builder.flush(step=1) + first = await builder.flush() assert first["costs/train_cum"] == pytest.approx(1.0) - second = await builder.flush(step=2) + second = await builder.flush() assert not any(key.startswith("costs/") for key in second) builder.add_cost("train/gpu", usd=2.0) - third = await builder.flush(step=3) + third = await builder.flush() assert third["costs/train"] == pytest.approx(2.0) 
assert third["costs/train_cum"] == pytest.approx(3.0) @@ -155,14 +155,14 @@ async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None: async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: before = MetricsBuilder(cost_context="train") before.add_cost("train/gpu", usd=1.0) - await before.flush(step=1) + await before.flush() state = before.state_dict() after = MetricsBuilder(cost_context="train") after.load_state_dict(state) after.add_cost("train/gpu", usd=2.0) - metrics = await after.flush(step=2) + metrics = await after.flush() assert metrics["costs/train_cum"] == pytest.approx(3.0) assert metrics["costs/all_cum"] == pytest.approx(3.0) @@ -170,7 +170,7 @@ async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: before = MetricsBuilder(cost_context="train") before.add_cost("train/gpu", usd=1.0) - await before.flush(step=1) + await before.flush() after = MetricsBuilder(cost_context="train") after.load_state_dict(before.state_dict()) @@ -178,7 +178,7 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: eval_builder = after.for_cost_context("eval") eval_builder.add_cost("eval/judge", usd=2.0) - metrics = await eval_builder.flush(step=2) + metrics = await eval_builder.flush() assert metrics["costs/eval/judge"] == pytest.approx(2.0) assert metrics["costs/all_cum"] == pytest.approx(3.0) @@ -186,11 +186,11 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: async def test_unique_scenario_count_tracks_exact_ids(self) -> None: builder = MetricsBuilder(cost_context="train") builder.add_data(scenario_ids=["s1", "s2", "s3"]) - first = await builder.flush(step=1) + first = await builder.flush() assert first["data/cum_num_unique_scenarios"] == 3 builder.add_data(scenario_ids=["s2", "s4"]) - second = await builder.flush(step=2) + second = await builder.flush() assert 
second["data/cum_num_unique_scenarios"] == 4 @pytest.mark.asyncio @@ -200,11 +200,11 @@ async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None: builder.add_metric("data/step_trainer_tokens", 20.0) builder.add_data(scenario_ids=["s1"]) - first = await builder.flush(step=1) + first = await builder.flush() assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) assert first["data/cum_num_unique_scenarios"] == 1 - second = await builder.flush(step=2) + second = await builder.flush() assert second == {} @pytest.mark.asyncio @@ -217,7 +217,7 @@ async def worker() -> None: await asyncio.sleep(0) await asyncio.gather(*(worker() for _ in range(4))) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["costs/train/gpu"] == pytest.approx(10.0) assert metrics["costs/all"] == pytest.approx(10.0) diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index 09915d12..897a57d2 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -52,7 +52,7 @@ async def _judge() -> _OpenAIResponse: finally: token.var.reset(token) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) @pytest.mark.asyncio @@ -74,7 +74,7 @@ async def _judge() -> _AnthropicResponse: finally: token.var.reset(token) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062) @pytest.mark.asyncio @@ -97,7 +97,7 @@ async def _judge() -> _OpenAIResponse: finally: token.var.reset(token) - metrics = await builder.flush(step=1) + metrics = await builder.flush() assert metrics["costs/train/llm_judge/custom"] == pytest.approx(0.75) @pytest.mark.asyncio @@ -129,7 +129,7 @@ async def _judge() -> _OpenAIResponse: finally: token.var.reset(token) - metrics = await builder.flush(step=1) + metrics = await 
builder.flush() assert metrics["costs/eval/llm_judge/correctness"] == pytest.approx(0.0002) From 9ee7a5968114aabfb98d7e27e0ac7d16c06f34c0 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 14:56:34 -0700 Subject: [PATCH 27/46] refactor: Extract API cost tracking helpers --- src/art/metrics.py | 288 ++++------------------------- src/art/metrics_api_cost.py | 252 +++++++++++++++++++++++++ tests/unit/test_metrics_builder.py | 18 ++ 3 files changed, 309 insertions(+), 249 deletions(-) create mode 100644 src/art/metrics_api_cost.py diff --git a/src/art/metrics.py b/src/art/metrics.py index 809d5061..bb23f510 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -1,16 +1,19 @@ from __future__ import annotations import asyncio -from collections.abc import Callable from contextlib import contextmanager from contextvars import ContextVar, Token from dataclasses import dataclass -from functools import wraps -from inspect import iscoroutinefunction import time -from typing import Any, ParamSpec, TypeVar +from typing import Any -from .costs import tokens_to_cost +from .metrics_api_cost import ( + DEFAULT_TOKEN_PRICING, + CostExtractor, + TokenPricing, + extract_api_cost, + normalize_provider, +) _active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder") @@ -19,30 +22,6 @@ "throughput/step_trainer_idle_s": "throughput/cum_trainer_idle_s", "throughput/step_actor_idle_s": "throughput/cum_actor_idle_s", } -_DEFAULT_PROVIDER = "openai" -_OPENAI_PROVIDER = "openai" -_ANTHROPIC_PROVIDER = "anthropic" - -P = ParamSpec("P") -R = TypeVar("R") - - -CostExtractor = Callable[[Any], float | None] -ResponseGetter = Callable[[Any], Any] - - -@dataclass(frozen=True) -class TokenPricing: - prompt_per_million: float - completion_per_million: float - - -_DEFAULT_TOKEN_PRICING = { - _OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0), - _ANTHROPIC_PROVIDER: TokenPricing( - prompt_per_million=3.0, 
completion_per_million=15.0 - ), -} @dataclass @@ -64,83 +43,7 @@ def _new_shared_metrics_state() -> _SharedMetricsState: unique_scenario_ids=set(), pending_scenario_ids=set(), cost_extractors={}, - token_pricing=dict(_DEFAULT_TOKEN_PRICING), - ) - - -def _normalize_provider(provider: str | None) -> str | None: - if provider is None: - return None - normalized = provider.strip().lower() - if not normalized: - return None - return normalized - - -def _read_usage_field(usage: Any, field: str) -> float | None: - if usage is None: - return None - if isinstance(usage, dict): - value = usage.get(field) - else: - value = getattr(usage, field, None) - if value is None: - return None - return float(value) - - -def _response_usage(response: Any) -> Any: - if isinstance(response, dict): - return response.get("usage") - return getattr(response, "usage", None) - - -def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None: - usage = _response_usage(response) - prompt_tokens = _read_usage_field(usage, "prompt_tokens") - completion_tokens = _read_usage_field(usage, "completion_tokens") - if prompt_tokens is None and completion_tokens is None: - return None - return prompt_tokens or 0.0, completion_tokens or 0.0 - - -def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None: - usage = _response_usage(response) - input_tokens = _read_usage_field(usage, "input_tokens") - output_tokens = _read_usage_field(usage, "output_tokens") - if input_tokens is None and output_tokens is None: - return None - return input_tokens or 0.0, output_tokens or 0.0 - - -def _detect_provider(response: Any) -> str | None: - usage = _response_usage(response) - if usage is None: - return None - - if ( - _read_usage_field(usage, "prompt_tokens") is not None - or _read_usage_field(usage, "completion_tokens") is not None - ): - return _OPENAI_PROVIDER - if ( - _read_usage_field(usage, "input_tokens") is not None - or _read_usage_field(usage, "output_tokens") is not 
None - ): - return _ANTHROPIC_PROVIDER - return None - - -def _estimate_cost( - token_counts: tuple[float, float] | None, - pricing: TokenPricing, -) -> float | None: - if token_counts is None: - return None - prompt_tokens, completion_tokens = token_counts - return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost( - completion_tokens, - pricing.completion_per_million, + token_pricing=dict(DEFAULT_TOKEN_PRICING), ) @@ -167,6 +70,33 @@ def add_cost(self, path: str, usd: float) -> None: full_key = f"costs/{path}" self.add_metric(full_key, float(usd)) + def add_response_cost( + self, + source: str, + response: Any, + *, + provider: str | None = None, + prompt_price_per_million: float | None = None, + completion_price_per_million: float | None = None, + ) -> float | None: + normalized_source = source.strip("/") + if not normalized_source: + raise ValueError("source must be non-empty") + + cost = extract_api_cost( + response, + provider=provider, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + cost_extractors=self._shared_state.cost_extractors, + token_pricing=self._shared_state.token_pricing, + ) + if cost is None: + return None + + self.add_cost(f"{self.cost_context}/{normalized_source}", cost) + return cost + def add_metric(self, key: str, value: float) -> None: if "/" not in key: raise ValueError("Metric key must include a section prefix") @@ -284,7 +214,7 @@ def for_cost_context(self, cost_context: str) -> "MetricsBuilder": def register_cost_extractor( self, provider: str, extractor: CostExtractor ) -> None: - normalized_provider = _normalize_provider(provider) + normalized_provider = normalize_provider(provider) if normalized_provider is None: raise ValueError("provider must be non-empty") self._shared_state.cost_extractors[normalized_provider] = extractor @@ -296,7 +226,7 @@ def register_token_pricing( prompt_per_million: float, completion_per_million: float, ) -> None: 
- normalized_provider = _normalize_provider(provider) + normalized_provider = normalize_provider(provider) if normalized_provider is None: raise ValueError("provider must be non-empty") self._shared_state.token_pricing[normalized_provider] = TokenPricing( @@ -425,145 +355,5 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: ): result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds - def _resolve_token_pricing( - self, - provider: str | None, - *, - prompt_price_per_million: float | None = None, - completion_price_per_million: float | None = None, - ) -> TokenPricing: - normalized_provider = _normalize_provider(provider) or _DEFAULT_PROVIDER - default_pricing = self._shared_state.token_pricing.get( - normalized_provider, - self._shared_state.token_pricing[_DEFAULT_PROVIDER], - ) - return TokenPricing( - prompt_per_million=( - float(prompt_price_per_million) - if prompt_price_per_million is not None - else default_pricing.prompt_per_million - ), - completion_per_million=( - float(completion_price_per_million) - if completion_price_per_million is not None - else default_pricing.completion_per_million - ), - ) - - def _extract_api_cost( - self, - response: Any, - *, - provider: str | None = None, - prompt_price_per_million: float | None = None, - completion_price_per_million: float | None = None, - ) -> float | None: - provider_name = _normalize_provider(provider) or _detect_provider(response) - if provider_name is not None: - custom_extractor = self._shared_state.cost_extractors.get(provider_name) - if custom_extractor is not None: - custom_cost = custom_extractor(response) - if custom_cost is not None: - return float(custom_cost) - - token_pricing = self._resolve_token_pricing( - provider_name, - prompt_price_per_million=prompt_price_per_million, - completion_price_per_million=completion_price_per_million, - ) - if provider_name == _OPENAI_PROVIDER: - return _estimate_cost( - _extract_openai_token_counts(response), - 
token_pricing, - ) - if provider_name == _ANTHROPIC_PROVIDER: - return _estimate_cost( - _extract_anthropic_token_counts(response), - token_pricing, - ) - - token_pricing = self._resolve_token_pricing( - provider_name, - prompt_price_per_million=prompt_price_per_million, - completion_price_per_million=completion_price_per_million, - ) - token_counts = _extract_openai_token_counts(response) - if token_counts is None: - token_counts = _extract_anthropic_token_counts(response) - return _estimate_cost(token_counts, token_pricing) - - -def _record_api_cost( - *, - result: Any, - source: str, - provider: str | None, - response_getter: ResponseGetter | None, - prompt_price_per_million: float | None, - completion_price_per_million: float | None, -) -> None: - try: - builder = MetricsBuilder.get_active() - except LookupError: - return - - response = response_getter(result) if response_getter is not None else result - cost = builder._extract_api_cost( - response, - provider=provider, - prompt_price_per_million=prompt_price_per_million, - completion_price_per_million=completion_price_per_million, - ) - if cost is None: - return - builder.add_cost(f"{builder.cost_context}/{source}", cost) - - -def track_api_cost( - *, - source: str, - provider: str | None = None, - response_getter: ResponseGetter | None = None, - prompt_price_per_million: float | None = None, - completion_price_per_million: float | None = None, -) -> Callable[[Callable[P, R]], Callable[P, R]]: - normalized_source = source.strip("/") - if not normalized_source: - raise ValueError("source must be non-empty") - - normalized_provider = _normalize_provider(provider) - - def _decorate(func: Callable[P, R]) -> Callable[P, R]: - if iscoroutinefunction(func): - - @wraps(func) - async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): - result = await func(*args, **kwargs) - _record_api_cost( - result=result, - source=normalized_source, - provider=normalized_provider, - response_getter=response_getter, - 
prompt_price_per_million=prompt_price_per_million, - completion_price_per_million=completion_price_per_million, - ) - return result - - return _async_wrapper - - @wraps(func) - def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): - result = func(*args, **kwargs) - _record_api_cost( - result=result, - source=normalized_source, - provider=normalized_provider, - response_getter=response_getter, - prompt_price_per_million=prompt_price_per_million, - completion_price_per_million=completion_price_per_million, - ) - return result - - return _sync_wrapper - return _decorate +from .metrics_api_cost import track_api_cost diff --git a/src/art/metrics_api_cost.py b/src/art/metrics_api_cost.py new file mode 100644 index 00000000..f9a8d3eb --- /dev/null +++ b/src/art/metrics_api_cost.py @@ -0,0 +1,252 @@ +from __future__ import annotations + +from collections.abc import Callable, Mapping +from dataclasses import dataclass +from functools import wraps +from inspect import iscoroutinefunction +from typing import Any, ParamSpec, TypeVar + +from .costs import tokens_to_cost + +DEFAULT_PROVIDER = "openai" +OPENAI_PROVIDER = "openai" +ANTHROPIC_PROVIDER = "anthropic" + +P = ParamSpec("P") +R = TypeVar("R") + +CostExtractor = Callable[[Any], float | None] +ResponseGetter = Callable[[Any], Any] + + +@dataclass(frozen=True) +class TokenPricing: + prompt_per_million: float + completion_per_million: float + + +DEFAULT_TOKEN_PRICING = { + OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0), + ANTHROPIC_PROVIDER: TokenPricing( + prompt_per_million=3.0, + completion_per_million=15.0, + ), +} + + +def normalize_provider(provider: str | None) -> str | None: + if provider is None: + return None + normalized = provider.strip().lower() + if not normalized: + return None + return normalized + + +def _read_usage_field(usage: Any, field: str) -> float | None: + if usage is None: + return None + if isinstance(usage, dict): + value = usage.get(field) + else: + value = 
getattr(usage, field, None) + if value is None: + return None + return float(value) + + +def _response_usage(response: Any) -> Any: + if isinstance(response, dict): + return response.get("usage") + return getattr(response, "usage", None) + + +def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None: + usage = _response_usage(response) + prompt_tokens = _read_usage_field(usage, "prompt_tokens") + completion_tokens = _read_usage_field(usage, "completion_tokens") + if prompt_tokens is None and completion_tokens is None: + return None + return prompt_tokens or 0.0, completion_tokens or 0.0 + + +def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None: + usage = _response_usage(response) + input_tokens = _read_usage_field(usage, "input_tokens") + output_tokens = _read_usage_field(usage, "output_tokens") + if input_tokens is None and output_tokens is None: + return None + return input_tokens or 0.0, output_tokens or 0.0 + + +def _detect_provider(response: Any) -> str | None: + usage = _response_usage(response) + if usage is None: + return None + + if ( + _read_usage_field(usage, "prompt_tokens") is not None + or _read_usage_field(usage, "completion_tokens") is not None + ): + return OPENAI_PROVIDER + if ( + _read_usage_field(usage, "input_tokens") is not None + or _read_usage_field(usage, "output_tokens") is not None + ): + return ANTHROPIC_PROVIDER + return None + + +def _estimate_cost( + token_counts: tuple[float, float] | None, + pricing: TokenPricing, +) -> float | None: + if token_counts is None: + return None + prompt_tokens, completion_tokens = token_counts + return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost( + completion_tokens, + pricing.completion_per_million, + ) + + +def _resolve_token_pricing( + provider: str | None, + *, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + token_pricing: Mapping[str, TokenPricing], +) -> TokenPricing: + 
normalized_provider = normalize_provider(provider) or DEFAULT_PROVIDER + default_pricing = token_pricing.get( + normalized_provider, + token_pricing[DEFAULT_PROVIDER], + ) + return TokenPricing( + prompt_per_million=( + float(prompt_price_per_million) + if prompt_price_per_million is not None + else default_pricing.prompt_per_million + ), + completion_per_million=( + float(completion_price_per_million) + if completion_price_per_million is not None + else default_pricing.completion_per_million + ), + ) + + +def extract_api_cost( + response: Any, + *, + provider: str | None, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + cost_extractors: Mapping[str, CostExtractor], + token_pricing: Mapping[str, TokenPricing], +) -> float | None: + provider_name = normalize_provider(provider) or _detect_provider(response) + if provider_name is not None: + custom_extractor = cost_extractors.get(provider_name) + if custom_extractor is not None: + custom_cost = custom_extractor(response) + if custom_cost is not None: + return float(custom_cost) + + pricing = _resolve_token_pricing( + provider_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + token_pricing=token_pricing, + ) + if provider_name == OPENAI_PROVIDER: + return _estimate_cost(_extract_openai_token_counts(response), pricing) + if provider_name == ANTHROPIC_PROVIDER: + return _estimate_cost(_extract_anthropic_token_counts(response), pricing) + + pricing = _resolve_token_pricing( + provider_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + token_pricing=token_pricing, + ) + token_counts = _extract_openai_token_counts(response) + if token_counts is None: + token_counts = _extract_anthropic_token_counts(response) + return _estimate_cost(token_counts, pricing) + + +def _record_api_cost( + *, + result: Any, + source: str, + provider: str | None, + 
response_getter: ResponseGetter | None, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, +) -> None: + try: + from .metrics import MetricsBuilder + + builder = MetricsBuilder.get_active() + except LookupError: + return + + response = response_getter(result) if response_getter is not None else result + builder.add_response_cost( + source, + response, + provider=provider, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + + +def track_api_cost( + *, + source: str, + provider: str | None = None, + response_getter: ResponseGetter | None = None, + prompt_price_per_million: float | None = None, + completion_price_per_million: float | None = None, +) -> Callable[[Callable[P, R]], Callable[P, R]]: + normalized_source = source.strip("/") + if not normalized_source: + raise ValueError("source must be non-empty") + + normalized_provider = normalize_provider(provider) + + def _decorate(func: Callable[P, R]) -> Callable[P, R]: + if iscoroutinefunction(func): + + @wraps(func) + async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): + result = await func(*args, **kwargs) + _record_api_cost( + result=result, + source=normalized_source, + provider=normalized_provider, + response_getter=response_getter, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + return result + + return _async_wrapper + + @wraps(func) + def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): + result = func(*args, **kwargs) + _record_api_cost( + result=result, + source=normalized_source, + provider=normalized_provider, + response_getter=response_getter, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + ) + return result + + return _sync_wrapper + + return _decorate diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 
1ef7cbae..cef84184 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -182,6 +182,24 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: assert metrics["costs/eval/judge"] == pytest.approx(2.0) assert metrics["costs/all_cum"] == pytest.approx(3.0) + @pytest.mark.asyncio + async def test_add_response_cost_uses_registered_pricing(self) -> None: + builder = MetricsBuilder(cost_context="eval") + builder.register_token_pricing( + "anthropic", + prompt_per_million=5.0, + completion_per_million=7.0, + ) + + cost = builder.add_response_cost( + "llm_judge/faithfulness", + {"usage": {"input_tokens": 40, "output_tokens": 60}}, + ) + + metrics = await builder.flush() + assert cost == pytest.approx(0.00062) + assert metrics["costs/eval/llm_judge/faithfulness"] == pytest.approx(0.00062) + @pytest.mark.asyncio async def test_unique_scenario_count_tracks_exact_ids(self) -> None: builder = MetricsBuilder(cost_context="train") From 29b7836e5062e1c885b2bbb992a91978b4b11b6b Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 15:27:36 -0700 Subject: [PATCH 28/46] fix: Simplify Metrics Logging And Cumulative Naming --- docs/metrics-taxonomy.md | 8 ++--- src/art/metrics.py | 39 ++++++++++++++++------ src/art/metrics_taxonomy.py | 24 ++++++++++++-- src/art/model.py | 40 +++++++++++------------ tests/unit/test_frontend_logging.py | 44 ++++++++++++------------- tests/unit/test_metric_routing.py | 4 +-- tests/unit/test_metrics_builder.py | 50 ++++++++++++++--------------- tests/unit/test_metrics_taxonomy.py | 11 +++++++ tests/unit/test_track_api_cost.py | 2 +- 9 files changed, 135 insertions(+), 87 deletions(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index 300bdf0b..38c12d1d 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -45,7 +45,7 @@ Cost leaves can be logged with either: ART rolls costs up automatically: - parent rollups (for example `costs/train`, 
`costs/all`) -- cumulative keys with `_cum` suffix (for example `costs/all_cum`) +- cumulative keys under the `cum/` namespace (for example `costs/cum/all`) ## Metrics Added By ART @@ -58,9 +58,9 @@ ART now emits the following metrics from library internals where the data is ava - `time/step_wall_s`, `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer` - `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted` - `data/step_num_groups_trainable` for train splits -- `data/cum_num_unique_scenarios` when scenario IDs are present in group or trajectory metadata +- `data/cum/num_unique_scenarios` when scenario IDs are present in group or trajectory metadata - `data/step_trainer_tokens` where the backend knows the trainer token count -- `throughput/cum_trainer_idle_s`, `throughput/cum_actor_idle_s` +- `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s` - `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer. 
@@ -135,7 +135,7 @@ The next `model.log(...)` flush for that step will include: - `costs/train/llm_judge/correctness` (or `costs/eval/...`) - hierarchical rollups like `costs/train`, `costs/all` -- cumulative keys like `costs/all_cum` +- cumulative keys like `costs/cum/all` Built-in providers: diff --git a/src/art/metrics.py b/src/art/metrics.py index bb23f510..0e9846d8 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -19,11 +19,30 @@ _HIERARCHICAL_SECTIONS = {"costs", "time", "data"} _THROUGHPUT_IDLE_MAPPINGS = { - "throughput/step_trainer_idle_s": "throughput/cum_trainer_idle_s", - "throughput/step_actor_idle_s": "throughput/cum_actor_idle_s", + "throughput/step_trainer_idle_s": "throughput/cum/trainer_idle_s", + "throughput/step_actor_idle_s": "throughput/cum/actor_idle_s", } +def is_cumulative_metric_key(key: str) -> bool: + parts = key.split("/", 2) + return len(parts) >= 2 and parts[1] == "cum" + + +def is_builder_managed_metric(key: str) -> bool: + return key.startswith(("costs/", "time/step_", "data/step_", "throughput/step_")) + + +def to_cumulative_metric_key(key: str) -> str: + if is_cumulative_metric_key(key): + raise ValueError(f"Metric key '{key}' is already cumulative.") + + section, rest = key.split("/", 1) + if rest.startswith("step_"): + rest = rest[len("step_") :] + return f"{section}/cum/{rest}" + + @dataclass class _SharedMetricsState: lock: asyncio.Lock @@ -167,7 +186,7 @@ async def flush(self) -> dict[str, float]: section = key.split("/", 1)[0] if section not in _HIERARCHICAL_SECTIONS: continue - cum_key = f"{key}_cum" + cum_key = to_cumulative_metric_key(key) next_value = self._shared_state.cum_state.get(cum_key, 0.0) + value self._shared_state.cum_state[cum_key] = next_value result[cum_key] = next_value @@ -176,7 +195,7 @@ async def flush(self) -> dict[str, float]: self._shared_state.unique_scenario_ids.update( self._shared_state.pending_scenario_ids ) - result["data/cum_num_unique_scenarios"] = float( + 
result["data/cum/num_unique_scenarios"] = float( len(self._shared_state.unique_scenario_ids) ) @@ -253,9 +272,9 @@ def load_state_dict(self, state: dict[str, Any]) -> None: self._shared_state.pending_scenario_ids.clear() def _validate_and_add(self, key: str, value: float) -> None: - if key.endswith("_cum"): + if is_cumulative_metric_key(key): raise ValueError( - f"Metric key '{key}' ends with '_cum', which is reserved for cumulative metrics." + f"Metric key '{key}' uses the reserved cumulative namespace." ) for existing_key in self._shared_state.step_buffer: @@ -329,10 +348,10 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: or "time/step_trainer_s" in result ): trainer_tokens = self._shared_state.cum_state.get( - "data/step_trainer_tokens_cum" + "data/cum/trainer_tokens" ) trainer_seconds = self._shared_state.cum_state.get( - "time/step_trainer_s_cum" + "time/cum/trainer_s" ) if ( trainer_tokens is not None @@ -345,9 +364,9 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: if "data/step_actor_tokens" in result or "time/step_actor_s" in result: actor_tokens = self._shared_state.cum_state.get( - "data/step_actor_tokens_cum" + "data/cum/actor_tokens" ) - actor_seconds = self._shared_state.cum_state.get("time/step_actor_s_cum") + actor_seconds = self._shared_state.cum_state.get("time/cum/actor_s") if ( actor_tokens is not None and actor_seconds is not None diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py index 061e0872..e4f9e713 100644 --- a/src/art/metrics_taxonomy.py +++ b/src/art/metrics_taxonomy.py @@ -33,6 +33,7 @@ "train_tokens": "data/step_trainer_tokens", "num_datums": "data/step_num_datums", } +_INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY}) def rename_train_metric_key(metric: str) -> str: @@ -48,13 +49,32 @@ def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]: def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]: 
totals: dict[str, float] = {} counts: dict[str, int] = {} + invariant_values: dict[str, float] = {} for sample in metric_samples: for key, value in sample.items(): - totals[key] = totals.get(key, 0.0) + float(value) + numeric_value = float(value) + if key in _INVARIANT_METRIC_KEYS: + previous_value = invariant_values.get(key) + if previous_value is None: + invariant_values[key] = numeric_value + elif previous_value != numeric_value: + raise ValueError( + f"Metric '{key}' must be invariant across samples, " + f"got {previous_value} and {numeric_value}." + ) + + totals[key] = totals.get(key, 0.0) + numeric_value counts[key] = counts.get(key, 0) + 1 - return {key: totals[key] / counts[key] for key in totals} + return { + key: ( + invariant_values[key] + if key in _INVARIANT_METRIC_KEYS + else totals[key] / counts[key] + ) + for key in totals + } @dataclass(frozen=True) diff --git a/src/art/model.py b/src/art/model.py index ff0d75e3..aba894a0 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -15,7 +15,7 @@ from . 
import dev from .costs import CostCalculator -from .metrics import MetricsBuilder +from .metrics import MetricsBuilder, is_builder_managed_metric from .metrics_taxonomy import ( TRAIN_GRADIENT_STEPS_KEY, average_metric_samples, @@ -39,7 +39,6 @@ COSTS_METRIC_PREFIX = "costs_" COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" METRICS_BUILDER_STATE_KEY = "_metrics_builder_state" -BUILDER_CUMULATIVE_PREFIXES = ("time/step_", "data/step_", "throughput/step_") METRIC_SECTIONS = frozenset( { "reward", @@ -493,7 +492,7 @@ def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None: wandb.define_metric(key, step_metric="training_step") self._wandb_defined_metrics.add(key) - def _extract_non_cost_metrics( + def _route_metrics_and_extract_non_costs( self, metrics: dict[str, float], split: str ) -> dict[str, float]: non_cost_metrics: dict[str, float] = {} @@ -515,7 +514,7 @@ def _extract_non_cost_metrics( f"{cost_context}/{component}", numeric_value ) continue - if metric.startswith(BUILDER_CUMULATIVE_PREFIXES): + if is_builder_managed_metric(metric): self._metrics_builder.add_metric(metric, numeric_value) continue non_cost_metrics[metric] = numeric_value @@ -633,12 +632,13 @@ async def log( # If only metrics provided (no trajectories), just log them and return if trajectories is None: if metrics is not None: - metrics_without_costs = self._extract_non_cost_metrics(metrics, split) - if metrics_without_costs: - self._log_metrics(metrics_without_costs, split, step) - costs = await self._metrics_builder.flush() - if costs: - self._log_metrics(costs, split, step) + metrics_without_costs = self._route_metrics_and_extract_non_costs( + metrics, split + ) + builder_metrics = await self._metrics_builder.flush() + merged_metrics = {**metrics_without_costs, **builder_metrics} + if merged_metrics: + self._log_metrics(merged_metrics, split, step) self._persist_metrics_builder_state() return @@ -676,7 +676,7 @@ async def log( for group in trajectory_groups: if group.metrics: - 
group_non_cost = self._extract_non_cost_metrics( + group_non_cost = self._route_metrics_and_extract_non_costs( cast(dict[str, float], group.metrics), split ) else: @@ -701,7 +701,7 @@ async def log( routed_metric = f"reward/{routed_metric}" trajectory_metrics[routed_metric] = float(value) - non_cost_trajectory_metrics = self._extract_non_cost_metrics( + non_cost_trajectory_metrics = self._route_metrics_and_extract_non_costs( trajectory_metrics, split, ) @@ -735,16 +735,16 @@ async def log( # Merge in any additional metrics passed directly if metrics is not None: - metrics_without_costs = self._extract_non_cost_metrics(metrics, split) + metrics_without_costs = self._route_metrics_and_extract_non_costs( + metrics, split + ) averages.update(metrics_without_costs) - # 3. Log metrics (writes to history.jsonl and wandb) - self._log_metrics(averages, split, step) - - # 4. Log cumulative costs - costs = await self._metrics_builder.flush() - if costs: - self._log_metrics(costs, split, step) + # 3. Merge in any builder-managed metrics and log a single row. + builder_metrics = await self._metrics_builder.flush() + merged_metrics = {**averages, **builder_metrics} + if merged_metrics: + self._log_metrics(merged_metrics, split, step) self._persist_metrics_builder_state() async def get_step(self) -> int: diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 7f689c42..755d7e64 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -227,9 +227,7 @@ async def test_history_appends_entries( history_path = tmp_path / "test-project/models/test-model/history.jsonl" df = pl.read_ndjson(str(history_path)) - # Each log call now emits the primary metrics row plus a taxonomy - # row for cumulative data/time metrics. 
- assert len(df) == 4 + assert len(df) == 2 # Check both splits are present columns = df.columns @@ -351,9 +349,9 @@ async def test_metric_prefixes(self, tmp_path: Path): "time/wall_clock_sec", ] ] - assert all(k.startswith("val/") for k in metric_keys), ( - f"Not all metrics prefixed: {metric_keys}" - ) + assert all( + k.startswith(("val/", "data/")) for k in metric_keys + ), f"Not all metrics routed into taxonomy namespaces: {metric_keys}" assert entry["training_step"] == 0 assert entry["time/wall_clock_sec"] >= 0 @@ -619,7 +617,7 @@ async def test_train_logs_add_default_data_metrics_from_trajectory_groups( assert merged["data/step_num_trajectories"] == pytest.approx(3.0) assert merged["data/step_num_groups_submitted"] == pytest.approx(2.0) assert merged["data/step_num_groups_trainable"] == pytest.approx(1.0) - assert merged["data/cum_num_unique_scenarios"] == pytest.approx(2.0) + assert merged["data/cum/num_unique_scenarios"] == pytest.approx(2.0) assert merged["train/num_groups_submitted"] == pytest.approx(2.0) assert merged["train/num_groups_trainable"] == pytest.approx(1.0) assert merged["train/num_trajectories"] == pytest.approx(3.0) @@ -660,12 +658,12 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): assert first["costs/train/sample"] == pytest.approx(0.3) assert first["costs/train"] == pytest.approx(0.5) assert first["costs/all"] == pytest.approx(0.5) - assert first["costs/all_cum"] == pytest.approx(0.5) + assert first["costs/cum/all"] == pytest.approx(0.5) assert second["costs/train/prefill"] == pytest.approx(0.1) - assert second["costs/train/prefill_cum"] == pytest.approx(0.3) - assert second["costs/train_cum"] == pytest.approx(0.6) - assert second["costs/all_cum"] == pytest.approx(0.6) + assert second["costs/cum/train/prefill"] == pytest.approx(0.3) + assert second["costs/cum/train"] == pytest.approx(0.6) + assert second["costs/cum/all"] == pytest.approx(0.6) @pytest.mark.asyncio async def 
test_cost_cumulative_persists_across_model_recreation( @@ -702,9 +700,9 @@ async def test_cost_cumulative_persists_across_model_recreation( first = json.loads(f.readline()) second = json.loads(f.readline()) - assert first["costs/train/prefill_cum"] == pytest.approx(0.25) - assert second["costs/train/prefill_cum"] == pytest.approx(1.0) - assert second["costs/all_cum"] == pytest.approx(1.0) + assert first["costs/cum/train/prefill"] == pytest.approx(0.25) + assert second["costs/cum/train/prefill"] == pytest.approx(1.0) + assert second["costs/cum/all"] == pytest.approx(1.0) @pytest.mark.asyncio async def test_metrics_builder_loads_resume_state_before_builder_use( @@ -733,8 +731,8 @@ async def test_metrics_builder_loads_resume_state_before_builder_use( first = json.loads(f.readline()) second = json.loads(f.readline()) - assert first["data/cum_num_unique_scenarios"] == pytest.approx(1.0) - assert second["data/cum_num_unique_scenarios"] == pytest.approx(2.0) + assert first["data/cum/num_unique_scenarios"] == pytest.approx(1.0) + assert second["data/cum/num_unique_scenarios"] == pytest.approx(2.0) @pytest.mark.asyncio async def test_direct_time_and_data_metrics_get_cumulative_variants( @@ -762,9 +760,9 @@ async def test_direct_time_and_data_metrics_get_cumulative_variants( entry = json.loads(f.readline()) assert entry["time/step_actor_s"] == pytest.approx(1.5) - assert entry["time/step_actor_s_cum"] == pytest.approx(1.5) + assert entry["time/cum/actor_s"] == pytest.approx(1.5) assert entry["data/step_actor_tokens"] == pytest.approx(10) - assert entry["data/step_actor_tokens_cum"] == pytest.approx(10) + assert entry["data/cum/actor_tokens"] == pytest.approx(10) @pytest.mark.asyncio async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row( @@ -799,10 +797,10 @@ async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row( assert len(rows) == 2 assert rows[0]["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) - assert 
rows[0]["data/cum_num_unique_scenarios"] == pytest.approx(1.0) + assert rows[0]["data/cum/num_unique_scenarios"] == pytest.approx(1.0) assert rows[1]["loss/train"] == pytest.approx(1.0) assert "throughput/avg_trainer_tok_per_s" not in rows[1] - assert "data/cum_num_unique_scenarios" not in rows[1] + assert "data/cum/num_unique_scenarios" not in rows[1] class TestWandbIntegration: @@ -984,7 +982,7 @@ async def mock_train_sft(*args, **kwargs): with open(history_path) as f: lines = f.readlines() - assert len(lines) == 2, f"Expected 2 log entries, got {len(lines)}" + assert len(lines) == 1, f"Expected 1 log entry, got {len(lines)}" entries = [json.loads(line) for line in lines] merged: dict[str, float] = {} @@ -995,7 +993,7 @@ async def mock_train_sft(*args, **kwargs): assert merged["loss/train"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 assert merged["loss/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 assert merged["time/step_trainer_s"] >= 0 - assert merged["time/step_trainer_s_cum"] >= 0 + assert merged["time/cum/trainer_s"] >= 0 @pytest.mark.asyncio async def test_train_sft_single_step_increment(self, tmp_path: Path): @@ -1032,7 +1030,7 @@ async def mock_train_sft(*args, **kwargs): history_path = tmp_path / "test-project/models/test-sft-step/history.jsonl" df = pl.read_ndjson(str(history_path)) - assert len(df) == 2, "Should have exactly 2 log entries" + assert len(df) == 1, "Should have exactly 1 log entry" assert set(df["step"].to_list()) == {1}, "Step should be 1 (single increment)" @pytest.mark.asyncio diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py index 2587385d..d7dcd2b5 100644 --- a/tests/unit/test_metric_routing.py +++ b/tests/unit/test_metric_routing.py @@ -97,7 +97,7 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step( model._log_metrics( { "costs/train/sample": 0.1, - "costs/train/prefill_cum": 0.2, + "costs/cum/train/prefill": 0.2, }, split="train", step=1, @@ -109,6 +109,6 @@ def 
test_log_metrics_defines_nested_cost_keys_with_training_step( ] assert (("costs/train/sample",), {"step_metric": "training_step"}) in define_calls assert ( - (("costs/train/prefill_cum",), {"step_metric": "training_step"}) + (("costs/cum/train/prefill",), {"step_metric": "training_step"}) in define_calls ) diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index cef84184..d1df45e8 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -21,9 +21,9 @@ async def test_rollup_correctness_across_depths(self) -> None: assert metrics["costs/train"] == pytest.approx(1.77) assert metrics["costs/eval"] == pytest.approx(0.06) assert metrics["costs/all"] == pytest.approx(1.83) - assert metrics["costs/train/llm_judge_cum"] == pytest.approx(0.12) - assert metrics["costs/train_cum"] == pytest.approx(1.77) - assert metrics["costs/all_cum"] == pytest.approx(1.83) + assert metrics["costs/cum/train/llm_judge"] == pytest.approx(0.12) + assert metrics["costs/cum/train"] == pytest.approx(1.77) + assert metrics["costs/cum/all"] == pytest.approx(1.83) @pytest.mark.asyncio async def test_cum_accumulates_for_hierarchical_sections(self) -> None: @@ -37,11 +37,11 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None: ) first = await builder.flush() - assert first["time/step_wall_s_cum"] == pytest.approx(1.5) - assert first["time/step_actor_s_cum"] == pytest.approx(0.3) - assert first["data/step_num_scenarios_cum"] == pytest.approx(2) - assert first["data/step_actor_tokens_cum"] == pytest.approx(10) - assert first["data/cum_num_unique_scenarios"] == 2 + assert first["time/cum/wall_s"] == pytest.approx(1.5) + assert first["time/cum/actor_s"] == pytest.approx(0.3) + assert first["data/cum/num_scenarios"] == pytest.approx(2) + assert first["data/cum/actor_tokens"] == pytest.approx(10) + assert first["data/cum/num_unique_scenarios"] == 2 builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2) 
builder.add_data( @@ -51,11 +51,11 @@ async def test_cum_accumulates_for_hierarchical_sections(self) -> None: ) second = await builder.flush() - assert second["time/step_wall_s_cum"] == pytest.approx(2.0) - assert second["time/step_actor_s_cum"] == pytest.approx(0.5) - assert second["data/step_num_scenarios_cum"] == pytest.approx(5) - assert second["data/step_actor_tokens_cum"] == pytest.approx(15) - assert second["data/cum_num_unique_scenarios"] == 3 + assert second["time/cum/wall_s"] == pytest.approx(2.0) + assert second["time/cum/actor_s"] == pytest.approx(0.5) + assert second["data/cum/num_scenarios"] == pytest.approx(5) + assert second["data/cum/actor_tokens"] == pytest.approx(15) + assert second["data/cum/num_unique_scenarios"] == 3 @pytest.mark.asyncio async def test_helper_metrics_accumulate_within_a_single_step(self) -> None: @@ -90,8 +90,8 @@ async def test_throughput_metrics_derive_from_time_and_token_cumulatives(self) - metrics = await builder.flush() - assert metrics["throughput/cum_trainer_idle_s"] == pytest.approx(1.5) - assert metrics["throughput/cum_actor_idle_s"] == pytest.approx(0.5) + assert metrics["throughput/cum/trainer_idle_s"] == pytest.approx(1.5) + assert metrics["throughput/cum/actor_idle_s"] == pytest.approx(0.5) assert metrics["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) assert metrics["throughput/avg_actor_tok_per_s"] == pytest.approx(5.0) @@ -131,17 +131,17 @@ async def test_duplicate_leaf_writes_are_summed(self) -> None: assert metrics["costs/train"] == pytest.approx(2.0) assert metrics["costs/all"] == pytest.approx(2.0) - def test_cum_suffix_is_reserved(self) -> None: + def test_cumulative_namespace_is_reserved(self) -> None: builder = MetricsBuilder(cost_context="train") with pytest.raises(ValueError): - builder.add_cost("train/llm_judge_cum", usd=0.1) + builder.add_metric("costs/cum/train/llm_judge", 0.1) @pytest.mark.asyncio async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None: builder = 
MetricsBuilder(cost_context="train") builder.add_cost("train/gpu", usd=1.0) first = await builder.flush() - assert first["costs/train_cum"] == pytest.approx(1.0) + assert first["costs/cum/train"] == pytest.approx(1.0) second = await builder.flush() assert not any(key.startswith("costs/") for key in second) @@ -149,7 +149,7 @@ async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None: builder.add_cost("train/gpu", usd=2.0) third = await builder.flush() assert third["costs/train"] == pytest.approx(2.0) - assert third["costs/train_cum"] == pytest.approx(3.0) + assert third["costs/cum/train"] == pytest.approx(3.0) @pytest.mark.asyncio async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: @@ -163,8 +163,8 @@ async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: after.add_cost("train/gpu", usd=2.0) metrics = await after.flush() - assert metrics["costs/train_cum"] == pytest.approx(3.0) - assert metrics["costs/all_cum"] == pytest.approx(3.0) + assert metrics["costs/cum/train"] == pytest.approx(3.0) + assert metrics["costs/cum/all"] == pytest.approx(3.0) @pytest.mark.asyncio async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: @@ -180,7 +180,7 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: metrics = await eval_builder.flush() assert metrics["costs/eval/judge"] == pytest.approx(2.0) - assert metrics["costs/all_cum"] == pytest.approx(3.0) + assert metrics["costs/cum/all"] == pytest.approx(3.0) @pytest.mark.asyncio async def test_add_response_cost_uses_registered_pricing(self) -> None: @@ -205,11 +205,11 @@ async def test_unique_scenario_count_tracks_exact_ids(self) -> None: builder = MetricsBuilder(cost_context="train") builder.add_data(scenario_ids=["s1", "s2", "s3"]) first = await builder.flush() - assert first["data/cum_num_unique_scenarios"] == 3 + assert first["data/cum/num_unique_scenarios"] == 3 builder.add_data(scenario_ids=["s2", "s4"]) second = 
await builder.flush() - assert second["data/cum_num_unique_scenarios"] == 4 + assert second["data/cum/num_unique_scenarios"] == 4 @pytest.mark.asyncio async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None: @@ -220,7 +220,7 @@ async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None: first = await builder.flush() assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) - assert first["data/cum_num_unique_scenarios"] == 1 + assert first["data/cum/num_unique_scenarios"] == 1 second = await builder.flush() assert second == {} diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py index 45085476..7d22ddf2 100644 --- a/tests/unit/test_metrics_taxonomy.py +++ b/tests/unit/test_metrics_taxonomy.py @@ -1,6 +1,7 @@ import pytest from art.metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, TrajectoryBatchSummary, average_metric_samples, build_training_summary_metrics, @@ -38,3 +39,13 @@ def test_build_training_summary_metrics_includes_data_and_train_sections() -> No assert metrics["data/step_num_groups_trainable"] == pytest.approx(1.0) assert metrics["train/num_groups_submitted"] == pytest.approx(2.0) assert metrics["train/num_trajectories"] == pytest.approx(5.0) + + +def test_average_metric_samples_requires_invariant_gradient_step_count() -> None: + with pytest.raises(ValueError, match="must be invariant"): + average_metric_samples( + [ + {TRAIN_GRADIENT_STEPS_KEY: 2.0}, + {TRAIN_GRADIENT_STEPS_KEY: 3.0}, + ] + ) diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index 897a57d2..b8ac6292 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -190,7 +190,7 @@ async def _eval_judge() -> _AnthropicResponse: assert first["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) assert second["costs/eval/llm_judge/factuality"] == pytest.approx(0.00016) - assert second["costs/all_cum"] == pytest.approx(0.00036) + 
assert second["costs/cum/all"] == pytest.approx(0.00036) @pytest.mark.asyncio async def test_pipeline_trainer_activates_train_context_for_rollouts( From 8c2042c1cb939db20844ffa7bb8b400e5447e72d Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 15:31:02 -0700 Subject: [PATCH 29/46] fix: Require Model-Aware Api Cost Pricing --- docs/metrics-taxonomy.md | 20 ++-- examples/metrics_taxonomy_smoke.py | 3 +- src/art/metrics.py | 24 +++-- src/art/metrics_api_cost.py | 146 +++++++++++++++++++++++------ tests/unit/test_metrics_builder.py | 11 ++- tests/unit/test_track_api_cost.py | 44 +++++++-- 6 files changed, 191 insertions(+), 57 deletions(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index 38c12d1d..bc4f7980 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -105,12 +105,11 @@ from art.metrics import track_api_cost @track_api_cost( source="llm_judge/correctness", provider="openai", - prompt_price_per_million=1.0, - completion_price_per_million=2.0, + model_name="openai/gpt-oss-20b", ) async def run_judge(client, messages): return await client.chat.completions.create( - model="gpt-4o-mini", + model="gpt-oss-20b", messages=messages, ) ``` @@ -137,15 +136,24 @@ The next `model.log(...)` flush for that step will include: - hierarchical rollups like `costs/train`, `costs/all` - cumulative keys like `costs/cum/all` -Built-in providers: +Built-in usage extraction: - OpenAI usage (`prompt_tokens`, `completion_tokens`) - Anthropic usage (`input_tokens`, `output_tokens`) -You can override pricing per decorator call or configure builder-level defaults: +Pricing is model-aware by default. ART will use the configured model pricing from +`art.costs.MODEL_PRICING` when it can resolve a concrete model name, and it +raises instead of guessing when pricing is missing. 
+ +You can still override pricing per decorator call or register model-specific +pricing on the builder: ```python builder = model.metrics_builder() -builder.register_token_pricing("openai", prompt_per_million=1.2, completion_per_million=4.8) +builder.register_model_pricing( + "anthropic/my-custom-judge", + prompt_per_million=1.2, + completion_per_million=4.8, +) builder.register_cost_extractor("openai", lambda response: 0.001) # optional custom extractor ``` diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py index 4f2c4a2f..ff4d4afe 100644 --- a/examples/metrics_taxonomy_smoke.py +++ b/examples/metrics_taxonomy_smoke.py @@ -25,8 +25,7 @@ def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: @track_api_cost( source="llm_judge/decorator_demo", provider="openai", - prompt_price_per_million=1.0, - completion_price_per_million=2.0, + model_name="openai/gpt-oss-20b", ) async def _mock_judge_call(step: int) -> _Response: return _Response( diff --git a/src/art/metrics.py b/src/art/metrics.py index 0e9846d8..29946316 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -8,8 +8,8 @@ from typing import Any from .metrics_api_cost import ( - DEFAULT_TOKEN_PRICING, CostExtractor, + ModelNameGetter, TokenPricing, extract_api_cost, normalize_provider, @@ -51,7 +51,7 @@ class _SharedMetricsState: unique_scenario_ids: set[str] pending_scenario_ids: set[str] cost_extractors: dict[str, CostExtractor] - token_pricing: dict[str, TokenPricing] + model_pricing: dict[str, TokenPricing] def _new_shared_metrics_state() -> _SharedMetricsState: @@ -62,7 +62,7 @@ def _new_shared_metrics_state() -> _SharedMetricsState: unique_scenario_ids=set(), pending_scenario_ids=set(), cost_extractors={}, - token_pricing=dict(DEFAULT_TOKEN_PRICING), + model_pricing={}, ) @@ -95,6 +95,8 @@ def add_response_cost( response: Any, *, provider: str | None = None, + model_name: str | None = None, + model_name_getter: "ModelNameGetter | None" = None, 
prompt_price_per_million: float | None = None, completion_price_per_million: float | None = None, ) -> float | None: @@ -105,10 +107,12 @@ def add_response_cost( cost = extract_api_cost( response, provider=provider, + model_name=model_name, + model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, cost_extractors=self._shared_state.cost_extractors, - token_pricing=self._shared_state.token_pricing, + model_pricing=self._shared_state.model_pricing, ) if cost is None: return None @@ -238,17 +242,17 @@ def register_cost_extractor( raise ValueError("provider must be non-empty") self._shared_state.cost_extractors[normalized_provider] = extractor - def register_token_pricing( + def register_model_pricing( self, - provider: str, + model_name: str, *, prompt_per_million: float, completion_per_million: float, ) -> None: - normalized_provider = normalize_provider(provider) - if normalized_provider is None: - raise ValueError("provider must be non-empty") - self._shared_state.token_pricing[normalized_provider] = TokenPricing( + normalized_model_name = model_name.strip() + if not normalized_model_name: + raise ValueError("model_name must be non-empty") + self._shared_state.model_pricing[normalized_model_name] = TokenPricing( prompt_per_million=float(prompt_per_million), completion_per_million=float(completion_per_million), ) diff --git a/src/art/metrics_api_cost.py b/src/art/metrics_api_cost.py index f9a8d3eb..6d713192 100644 --- a/src/art/metrics_api_cost.py +++ b/src/art/metrics_api_cost.py @@ -6,9 +6,8 @@ from inspect import iscoroutinefunction from typing import Any, ParamSpec, TypeVar -from .costs import tokens_to_cost +from .costs import get_model_pricing, tokens_to_cost -DEFAULT_PROVIDER = "openai" OPENAI_PROVIDER = "openai" ANTHROPIC_PROVIDER = "anthropic" @@ -16,6 +15,7 @@ R = TypeVar("R") CostExtractor = Callable[[Any], float | None] +ModelNameGetter = Callable[[Any], str | 
None] ResponseGetter = Callable[[Any], Any] @@ -24,16 +24,6 @@ class TokenPricing: prompt_per_million: float completion_per_million: float - -DEFAULT_TOKEN_PRICING = { - OPENAI_PROVIDER: TokenPricing(prompt_per_million=2.5, completion_per_million=10.0), - ANTHROPIC_PROVIDER: TokenPricing( - prompt_per_million=3.0, - completion_per_million=15.0, - ), -} - - def normalize_provider(provider: str | None) -> str | None: if provider is None: return None @@ -61,6 +51,17 @@ def _response_usage(response: Any) -> Any: return getattr(response, "usage", None) +def _response_model_name(response: Any) -> str | None: + if isinstance(response, dict): + value = response.get("model") + else: + value = getattr(response, "model", None) + if value is None: + return None + normalized = str(value).strip() + return normalized or None + + def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None: usage = _response_usage(response) prompt_tokens = _read_usage_field(usage, "prompt_tokens") @@ -110,28 +111,97 @@ def _estimate_cost( ) -def _resolve_token_pricing( +def _resolve_model_name( + response: Any, + *, provider: str | None, + model_name: str | None, + model_name_getter: ModelNameGetter | None, +) -> str | None: + explicit_model_name = model_name.strip() if model_name is not None else None + if explicit_model_name: + candidate = explicit_model_name + elif model_name_getter is not None: + candidate = model_name_getter(response) + else: + candidate = _response_model_name(response) + + if candidate is None: + return None + + normalized_model_name = str(candidate).strip() + if not normalized_model_name: + return None + + normalized_provider = normalize_provider(provider) + if normalized_provider is not None and "/" not in normalized_model_name: + provider_scoped_name = f"{normalized_provider}/{normalized_model_name}" + if get_model_pricing(provider_scoped_name) is not None: + return provider_scoped_name + + return normalized_model_name + + +def _resolve_token_pricing( + 
response: Any, *, + provider: str | None, + model_name: str | None, + model_name_getter: ModelNameGetter | None, prompt_price_per_million: float | None, completion_price_per_million: float | None, - token_pricing: Mapping[str, TokenPricing], + model_pricing: Mapping[str, TokenPricing], ) -> TokenPricing: - normalized_provider = normalize_provider(provider) or DEFAULT_PROVIDER - default_pricing = token_pricing.get( - normalized_provider, - token_pricing[DEFAULT_PROVIDER], + explicit_prompt_price = ( + float(prompt_price_per_million) + if prompt_price_per_million is not None + else None + ) + explicit_completion_price = ( + float(completion_price_per_million) + if completion_price_per_million is not None + else None ) + if ( + explicit_prompt_price is not None + and explicit_completion_price is not None + ): + return TokenPricing( + prompt_per_million=explicit_prompt_price, + completion_per_million=explicit_completion_price, + ) + + resolved_model_name = _resolve_model_name( + response, + provider=provider, + model_name=model_name, + model_name_getter=model_name_getter, + ) + if resolved_model_name is None: + raise ValueError( + "API cost tracking requires model-aware pricing. " + "Provide both explicit token prices or supply a model_name " + "(or response.model / model_name_getter) with configured pricing." 
+ ) + + configured_pricing = model_pricing.get(resolved_model_name) + if configured_pricing is None: + pricing = get_model_pricing(resolved_model_name, strict=True) + configured_pricing = TokenPricing( + prompt_per_million=pricing.prefill, + completion_per_million=pricing.sample, + ) + return TokenPricing( prompt_per_million=( - float(prompt_price_per_million) - if prompt_price_per_million is not None - else default_pricing.prompt_per_million + explicit_prompt_price + if explicit_prompt_price is not None + else configured_pricing.prompt_per_million ), completion_per_million=( - float(completion_price_per_million) - if completion_price_per_million is not None - else default_pricing.completion_per_million + explicit_completion_price + if explicit_completion_price is not None + else configured_pricing.completion_per_million ), ) @@ -140,10 +210,12 @@ def extract_api_cost( response: Any, *, provider: str | None, + model_name: str | None, + model_name_getter: ModelNameGetter | None, prompt_price_per_million: float | None, completion_price_per_million: float | None, cost_extractors: Mapping[str, CostExtractor], - token_pricing: Mapping[str, TokenPricing], + model_pricing: Mapping[str, TokenPricing], ) -> float | None: provider_name = normalize_provider(provider) or _detect_provider(response) if provider_name is not None: @@ -154,10 +226,13 @@ def extract_api_cost( return float(custom_cost) pricing = _resolve_token_pricing( - provider_name, + response, + provider=provider_name, + model_name=model_name, + model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, - token_pricing=token_pricing, + model_pricing=model_pricing, ) if provider_name == OPENAI_PROVIDER: return _estimate_cost(_extract_openai_token_counts(response), pricing) @@ -165,10 +240,13 @@ def extract_api_cost( return _estimate_cost(_extract_anthropic_token_counts(response), pricing) pricing = _resolve_token_pricing( - 
provider_name, + response, + provider=provider_name, + model_name=model_name, + model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, - token_pricing=token_pricing, + model_pricing=model_pricing, ) token_counts = _extract_openai_token_counts(response) if token_counts is None: @@ -182,6 +260,8 @@ def _record_api_cost( source: str, provider: str | None, response_getter: ResponseGetter | None, + model_name: str | None, + model_name_getter: ModelNameGetter | None, prompt_price_per_million: float | None, completion_price_per_million: float | None, ) -> None: @@ -197,6 +277,8 @@ def _record_api_cost( source, response, provider=provider, + model_name=model_name, + model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, ) @@ -206,6 +288,8 @@ def track_api_cost( *, source: str, provider: str | None = None, + model_name: str | None = None, + model_name_getter: ModelNameGetter | None = None, response_getter: ResponseGetter | None = None, prompt_price_per_million: float | None = None, completion_price_per_million: float | None = None, @@ -227,6 +311,8 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): source=normalized_source, provider=normalized_provider, response_getter=response_getter, + model_name=model_name, + model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, ) @@ -242,6 +328,8 @@ def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): source=normalized_source, provider=normalized_provider, response_getter=response_getter, + model_name=model_name, + model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, ) diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 
d1df45e8..57def527 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -183,17 +183,20 @@ async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: assert metrics["costs/cum/all"] == pytest.approx(3.0) @pytest.mark.asyncio - async def test_add_response_cost_uses_registered_pricing(self) -> None: + async def test_add_response_cost_uses_registered_model_pricing(self) -> None: builder = MetricsBuilder(cost_context="eval") - builder.register_token_pricing( - "anthropic", + builder.register_model_pricing( + "anthropic/test-judge", prompt_per_million=5.0, completion_per_million=7.0, ) cost = builder.add_response_cost( "llm_judge/faithfulness", - {"usage": {"input_tokens": 40, "output_tokens": 60}}, + { + "model": "anthropic/test-judge", + "usage": {"input_tokens": 40, "output_tokens": 60}, + }, ) metrics = await builder.flush() diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index b8ac6292..5162595f 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -17,8 +17,15 @@ def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: class _OpenAIResponse: - def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: + def __init__( + self, + prompt_tokens: int, + completion_tokens: int, + *, + model: str | None = None, + ) -> None: self.usage = _OpenAIUsage(prompt_tokens, completion_tokens) + self.model = model class _AnthropicUsage: @@ -28,8 +35,15 @@ def __init__(self, input_tokens: int, output_tokens: int) -> None: class _AnthropicResponse: - def __init__(self, input_tokens: int, output_tokens: int) -> None: + def __init__( + self, + input_tokens: int, + output_tokens: int, + *, + model: str | None = None, + ) -> None: self.usage = _AnthropicUsage(input_tokens, output_tokens) + self.model = model class TestTrackApiCost: @@ -56,15 +70,18 @@ async def _judge() -> _OpenAIResponse: assert 
metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) @pytest.mark.asyncio - async def test_anthropic_cost_extraction_uses_registered_pricing(self) -> None: + async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None: builder = MetricsBuilder(cost_context="train") - builder.register_token_pricing( - "anthropic", + builder.register_model_pricing( + "anthropic/test-judge", prompt_per_million=5.0, completion_per_million=7.0, ) - @track_api_cost(source="llm_judge/faithfulness") + @track_api_cost( + source="llm_judge/faithfulness", + model_name="anthropic/test-judge", + ) async def _judge() -> _AnthropicResponse: return _AnthropicResponse(input_tokens=40, output_tokens=60) @@ -77,6 +94,21 @@ async def _judge() -> _AnthropicResponse: metrics = await builder.flush() assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062) + @pytest.mark.asyncio + async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None: + builder = MetricsBuilder(cost_context="train") + + @track_api_cost(source="llm_judge/missing_pricing", provider="openai") + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=10, completion_tokens=20) + + token = builder.activate() + try: + with pytest.raises(ValueError, match="model-aware pricing"): + await _judge() + finally: + token.var.reset(token) + @pytest.mark.asyncio async def test_custom_extractor_takes_precedence(self) -> None: builder = MetricsBuilder(cost_context="train") From d2e92131722a6ed57cda5c4c010679ab443e707b Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 15:32:42 -0700 Subject: [PATCH 30/46] fix: Normalize Unsloth Eval Metric Routing --- src/art/unsloth/train.py | 16 ++++++++++------ tests/unit/test_unsloth_metrics.py | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 tests/unit/test_unsloth_metrics.py diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index 
f095fe35..4b8d15d7 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -12,7 +12,7 @@ from .. import dev from ..loss import loss_fn, shift_tensor -from ..metrics_taxonomy import rename_train_metrics +from ..metrics_taxonomy import rename_train_metric_key, rename_train_metrics from ..types import TrainConfig if TYPE_CHECKING: @@ -194,12 +194,16 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None: } # average the metrics # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` - # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + # start with "eval_". Normalize them into the `val/...` taxonomy instead. if next(iter(logs.keys())).startswith("eval_"): - metrics = {f"eval_{key}": val for key, val in metrics.items()} - - logs = {**rename_train_metrics(logs), **metrics} - results_queue.put_nowait(logs) + normalized_metrics = {f"val/{key}": val for key, val in metrics.items()} + normalized_logs = { + f"val/{rename_train_metric_key(key[len('eval_') :])}": val + for key, val in logs.items() + } + results_queue.put_nowait({**normalized_metrics, **normalized_logs}) + else: + results_queue.put_nowait({**rename_train_metrics(logs), **metrics}) trainer._metrics["train"].clear() return log diff --git a/tests/unit/test_unsloth_metrics.py b/tests/unit/test_unsloth_metrics.py new file mode 100644 index 00000000..fdb91b0c --- /dev/null +++ b/tests/unit/test_unsloth_metrics.py @@ -0,0 +1,25 @@ +import asyncio +from collections import defaultdict + +from art.unsloth.train import get_log_fn + + +class _DummyTrainer: + def __init__(self) -> None: + self._metrics = {"train": defaultdict(list)} + + +def test_get_log_fn_routes_eval_metrics_to_val_namespace() -> None: + trainer = _DummyTrainer() + trainer._metrics["train"]["loss/train"].append(1.5) + trainer._metrics["train"]["loss/entropy"].append(0.2) + results_queue: asyncio.Queue[dict[str, 
float]] = asyncio.Queue() + + log = get_log_fn(trainer, results_queue) + log({"eval_loss": 1.0, "eval_runtime": 2.0}) + + assert results_queue.get_nowait() == { + "val/loss/train": 1.0, + "val/loss/entropy": 0.2, + "val/runtime": 2.0, + } From 5c46148b4bd8b3b537cf5e8644e7d1828dfe927c Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 15:56:36 -0700 Subject: [PATCH 31/46] fix: Align Wandb Logging With Training Step --- src/art/model.py | 5 ++++- tests/unit/test_metric_routing.py | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/art/model.py b/src/art/model.py index aba894a0..b844f399 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -479,7 +479,10 @@ def _log_metrics( if should_log_wandb: if run := self._get_wandb_run(): self._define_wandb_step_metrics(prefixed.keys()) - run.log(prefixed) + # Keep W&B's internal step aligned with ART's training_step so + # multiple log calls for the same training step do not inflate + # the run's step count. 
+ run.log(prefixed, step=step) def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None: import wandb diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py index d7dcd2b5..4d6316d0 100644 --- a/tests/unit/test_metric_routing.py +++ b/tests/unit/test_metric_routing.py @@ -112,3 +112,10 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step( (("costs/cum/train/prefill",), {"step_metric": "training_step"}) in define_calls ) + fake_run.log.assert_called_once() + logged_metrics = fake_run.log.call_args.args[0] + assert logged_metrics["costs/train/sample"] == 0.1 + assert logged_metrics["costs/cum/train/prefill"] == 0.2 + assert logged_metrics["training_step"] == 1 + assert "time/wall_clock_sec" in logged_metrics + assert fake_run.log.call_args.kwargs == {"step": 1} From ad51d340f11688e7d903390a2496f83aee2c6799 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 15:56:41 -0700 Subject: [PATCH 32/46] refactor: Use Backend Train In Metrics Demo --- dev/yes-no-maybe-metrics.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py index 32729990..036e3847 100644 --- a/dev/yes-no-maybe-metrics.py +++ b/dev/yes-no-maybe-metrics.py @@ -238,19 +238,22 @@ async def main() -> None: train_builder.add_data( step_actor_tokens=total_actor_tokens(train_groups) ) - await model.train( + result = await backend.train( + model, train_groups, - config=art.TrainConfig(learning_rate=learning_rate), + learning_rate=learning_rate, ) - step = await model.get_step() await model.log( - trajectories=None, split="train", - step=step, - metrics={"time/step_wall_s": time.monotonic() - step_started}, + step=result.step, + trajectories=train_groups, + metrics={ + **result.metrics, + "time/step_wall_s": time.monotonic() - step_started, + }, ) - print(f"step {step} complete") + print(f"step {result.step} complete") print_history_summary(model) finally: 
From fe4a06b29f92f35b680d70abe4f4559c09464ab8 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 16:35:54 -0700 Subject: [PATCH 33/46] feat: Add LocalBackend wall time and GPU cost metrics --- docs/metrics-taxonomy.md | 8 ++- src/art/local/backend.py | 72 ++++++++++++++++++++++- src/art/model.py | 73 +++++++++++++++++++++-- tests/unit/test_frontend_logging.py | 89 +++++++++++++++++++++++++++++ 4 files changed, 234 insertions(+), 8 deletions(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index bc4f7980..bb9ee871 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -55,16 +55,22 @@ ART now emits the following metrics from library internals where the data is ava - `loss/*` from trainer backends - `time/wall_clock_sec` and `training_step` on every logged row - `time/step_trainer_s` for training calls -- `time/step_wall_s`, `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer` +- `time/step_wall_s` from `PipelineTrainer` and `LocalBackend` train-step logs +- `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer` - `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted` - `data/step_num_groups_trainable` for train splits - `data/cum/num_unique_scenarios` when scenario IDs are present in group or trajectory metadata - `data/step_trainer_tokens` where the backend knows the trainer token count +- `costs/gpu` on `LocalBackend` train-step logs when ART can resolve GPU pricing - `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s` - `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer. +For automatic GPU cost on `LocalBackend`, ART currently auto-detects H200s at +$3/hour per GPU. 
For other GPU types, pass `gpu_cost_per_hour_usd=...` to +`LocalBackend(...)` if you want ART to emit `costs/gpu` instead of skipping it. + ## User Helpers Use the builder helpers for step-level metrics that only user code can know: diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 1d839cf7..807a0f42 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -13,6 +13,10 @@ logger = logging.getLogger(__name__) +_AUTO_GPU_HOURLY_PRICING_USD = { + "H200": 3.0, +} + import aiohttp import numpy as np from openai import AsyncOpenAI @@ -68,7 +72,13 @@ class LocalBackend(Backend): - def __init__(self, *, in_process: bool = False, path: str | None = None) -> None: + def __init__( + self, + *, + in_process: bool = False, + path: str | None = None, + gpu_cost_per_hour_usd: float | None = None, + ) -> None: """ Initializes a local, directory-based Backend interface at the given path. @@ -79,9 +89,18 @@ def __init__(self, *, in_process: bool = False, path: str | None = None) -> None Args: in_process: Whether to run the local service in-process. path: The path to the local directory. Defaults to "{repo_root}/.art". + gpu_cost_per_hour_usd: Optional per-GPU hourly price override used for + automatic `costs/gpu` accounting on train steps. When unset, + ART auto-detects supported GPU types (H200 at $3/hr today) and + skips GPU cost logging for unknown devices instead of guessing. 
""" self._in_process = in_process self._path = path or get_default_art_path() + self._gpu_cost_per_hour_usd = ( + float(gpu_cost_per_hour_usd) + if gpu_cost_per_hour_usd is not None + else None + ) os.makedirs(self._path, exist_ok=True) # Other initialization @@ -89,6 +108,57 @@ def __init__(self, *, in_process: bool = False, path: str | None = None) -> None self._tokenizers: dict[str, PreTrainedTokenizerBase] = {} self._image_processors: dict[str, BaseImageProcessor | None] = {} + def supports_automatic_train_step_metrics(self) -> bool: + return True + + def automatic_gpu_cost_per_hour_usd(self, model: Model) -> float | None: + per_gpu_cost = self._resolve_gpu_cost_per_hour_usd() + if per_gpu_cost is None: + return None + + gpu_count = self._allocated_gpu_count(model) + if gpu_count <= 0: + return None + return per_gpu_cost * gpu_count + + def _resolve_gpu_cost_per_hour_usd(self) -> float | None: + if self._gpu_cost_per_hour_usd is not None: + return self._gpu_cost_per_hour_usd + if not torch.cuda.is_available(): + return None + + num_visible_gpus = torch.cuda.device_count() + if num_visible_gpus <= 0: + return None + + resolved_costs: list[float] = [] + for index in range(num_visible_gpus): + device_name = torch.cuda.get_device_name(index).upper() + for gpu_name, hourly_cost in _AUTO_GPU_HOURLY_PRICING_USD.items(): + if gpu_name in device_name: + resolved_costs.append(hourly_cost) + break + else: + return None + + if not resolved_costs: + return None + if len(set(resolved_costs)) != 1: + return None + return resolved_costs[0] + + def _allocated_gpu_count(self, model: Model) -> int: + if isinstance(model, TrainableModel) and model._internal_config is not None: + trainer_gpu_ids = set(model._internal_config.get("trainer_gpu_ids", [])) + inference_gpu_ids = set(model._internal_config.get("inference_gpu_ids", [])) + allocated_gpu_ids = trainer_gpu_ids | inference_gpu_ids + if allocated_gpu_ids: + return len(allocated_gpu_ids) + + if not torch.cuda.is_available(): + 
return 0 + return torch.cuda.device_count() + def __enter__(self) -> Self: return self diff --git a/src/art/model.py b/src/art/model.py index b844f399..62b4fcd8 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -118,6 +118,9 @@ class Model( _wandb_run: Optional["Run"] = None # Private, for lazy wandb initialization _wandb_defined_metrics: set[str] _run_start_time: float + _run_start_monotonic: float + _last_local_train_log_monotonic: float + _last_local_train_step: int | None _metrics_builder: MetricsBuilder _metrics_builder_state_loaded: bool _cost_calculator: CostCalculator @@ -151,6 +154,9 @@ def __init__( ) object.__setattr__(self, "_wandb_defined_metrics", set()) object.__setattr__(self, "_run_start_time", time.time()) + object.__setattr__(self, "_run_start_monotonic", time.monotonic()) + object.__setattr__(self, "_last_local_train_log_monotonic", self._run_start_monotonic) + object.__setattr__(self, "_last_local_train_step", None) object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train")) object.__setattr__(self, "_metrics_builder_state_loaded", False) @@ -495,7 +501,7 @@ def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None: wandb.define_metric(key, step_metric="training_step") self._wandb_defined_metrics.add(key) - def _route_metrics_and_extract_non_costs( + def _route_metrics_and_collect_non_costs( self, metrics: dict[str, float], split: str ) -> dict[str, float]: non_cost_metrics: dict[str, float] = {} @@ -523,6 +529,44 @@ def _route_metrics_and_extract_non_costs( non_cost_metrics[metric] = numeric_value return non_cost_metrics + def _collect_automatic_backend_metrics( + self, + *, + split: str, + step: int, + provided_metric_keys: set[str], + ) -> dict[str, float]: + if split != "train" or self._backend is None: + return {} + + supports_step_metrics = getattr( + self._backend, "supports_automatic_train_step_metrics", None + ) + if not callable(supports_step_metrics) or not supports_step_metrics(): + return {} + + 
if self._last_local_train_step == step: + return {} + + now = time.monotonic() + step_wall_s = max(0.0, now - self._last_local_train_log_monotonic) + object.__setattr__(self, "_last_local_train_log_monotonic", now) + object.__setattr__(self, "_last_local_train_step", step) + + automatic_metrics: dict[str, float] = {} + if "time/step_wall_s" not in provided_metric_keys: + automatic_metrics["time/step_wall_s"] = step_wall_s + + gpu_cost_getter = getattr(self._backend, "automatic_gpu_cost_per_hour_usd", None) + if callable(gpu_cost_getter) and "costs/gpu" not in provided_metric_keys: + gpu_cost_per_hour_usd = gpu_cost_getter(self) + if gpu_cost_per_hour_usd is not None: + automatic_metrics["costs/gpu"] = ( + step_wall_s * float(gpu_cost_per_hour_usd) / 3600.0 + ) + + return automatic_metrics + def _add_default_step_metrics( self, trajectory_groups: list[TrajectoryGroup], @@ -635,7 +679,15 @@ async def log( # If only metrics provided (no trajectories), just log them and return if trajectories is None: if metrics is not None: - metrics_without_costs = self._route_metrics_and_extract_non_costs( + provided_metric_keys = set(metrics) + automatic_metrics = self._collect_automatic_backend_metrics( + split=split, + step=step, + provided_metric_keys=provided_metric_keys, + ) + if automatic_metrics: + self._route_metrics_and_collect_non_costs(automatic_metrics, split) + metrics_without_costs = self._route_metrics_and_collect_non_costs( metrics, split ) builder_metrics = await self._metrics_builder.flush() @@ -646,11 +698,20 @@ async def log( return trajectory_groups = self._normalize_trajectory_groups(trajectories) + provided_metric_keys = set(metrics or {}) + + automatic_metrics = self._collect_automatic_backend_metrics( + split=split, + step=step, + provided_metric_keys=provided_metric_keys, + ) + if automatic_metrics: + self._route_metrics_and_collect_non_costs(automatic_metrics, split) default_train_metrics = self._add_default_step_metrics( trajectory_groups, split=split, - 
provided_metric_keys=set(metrics or {}), + provided_metric_keys=provided_metric_keys, ) # Ensure output directories exist @@ -679,7 +740,7 @@ async def log( for group in trajectory_groups: if group.metrics: - group_non_cost = self._route_metrics_and_extract_non_costs( + group_non_cost = self._route_metrics_and_collect_non_costs( cast(dict[str, float], group.metrics), split ) else: @@ -704,7 +765,7 @@ async def log( routed_metric = f"reward/{routed_metric}" trajectory_metrics[routed_metric] = float(value) - non_cost_trajectory_metrics = self._route_metrics_and_extract_non_costs( + non_cost_trajectory_metrics = self._route_metrics_and_collect_non_costs( trajectory_metrics, split, ) @@ -738,7 +799,7 @@ async def log( # Merge in any additional metrics passed directly if metrics is not None: - metrics_without_costs = self._route_metrics_and_extract_non_costs( + metrics_without_costs = self._route_metrics_and_collect_non_costs( metrics, split ) averages.update(metrics_without_costs) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 755d7e64..c0adcb40 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -898,6 +898,95 @@ def test_should_log_wandb_logic_empty_list(self, tmp_path: Path): assert should_log is False +class TestLocalBackendAutomaticMetrics: + @pytest.mark.asyncio + async def test_train_logs_automatic_wall_time_and_gpu_cost( + self, tmp_path: Path + ) -> None: + backend = LocalBackend(gpu_cost_per_hour_usd=3.0) + + with patch("art.model.time.monotonic", side_effect=[100.0, 106.0, 111.0]): + model = TrainableModel( + name="test-model", + project="test-project", + base_model="Qwen/Qwen3-4B-Instruct-2507", + base_path=str(tmp_path), + report_metrics=[], + _internal_config={"trainer_gpu_ids": [0]}, + ) + model._backend = backend + + await model.log( + trajectories=None, + split="train", + step=1, + metrics={"loss/train": 1.0}, + ) + await model.log( + trajectories=None, + 
split="train", + step=2, + metrics={"loss/train": 0.5}, + ) + + history_path = tmp_path / "test-project/models/test-model/history.jsonl" + rows = [json.loads(line) for line in history_path.open() if line.strip()] + + first_gpu_cost = 6.0 * 3.0 / 3600.0 + second_gpu_cost = 5.0 * 3.0 / 3600.0 + + assert rows[0]["time/step_wall_s"] == pytest.approx(6.0) + assert rows[0]["costs/gpu"] == pytest.approx(first_gpu_cost) + assert rows[0]["costs/all"] == pytest.approx(first_gpu_cost) + assert rows[0]["costs/cum/gpu"] == pytest.approx(first_gpu_cost) + + assert rows[1]["time/step_wall_s"] == pytest.approx(5.0) + assert rows[1]["costs/gpu"] == pytest.approx(second_gpu_cost) + assert rows[1]["costs/cum/gpu"] == pytest.approx( + first_gpu_cost + second_gpu_cost + ) + assert rows[1]["costs/cum/all"] == pytest.approx( + first_gpu_cost + second_gpu_cost + ) + + @pytest.mark.asyncio + async def test_unknown_local_gpu_skips_cost_but_keeps_wall_time( + self, tmp_path: Path + ) -> None: + backend = LocalBackend() + + with patch("art.model.time.monotonic", side_effect=[50.0, 55.0]): + with patch("art.local.backend.torch.cuda.is_available", return_value=True): + with patch("art.local.backend.torch.cuda.device_count", return_value=1): + with patch( + "art.local.backend.torch.cuda.get_device_name", + return_value="NVIDIA A100-SXM4-80GB", + ): + model = TrainableModel( + name="test-model", + project="test-project", + base_model="Qwen/Qwen3-4B-Instruct-2507", + base_path=str(tmp_path), + report_metrics=[], + _internal_config={"trainer_gpu_ids": [0]}, + ) + model._backend = backend + await model.log( + trajectories=None, + split="train", + step=1, + metrics={"loss/train": 1.0}, + ) + + history_path = tmp_path / "test-project/models/test-model/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["time/step_wall_s"] == pytest.approx(5.0) + assert "costs/gpu" not in entry + assert "costs/all" not in entry + + class TestModelAttributes: """Test new 
Model attributes.""" From 096c04235beeb5c128f4b8a0fdee8af2398986c6 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Mon, 9 Mar 2026 16:35:59 -0700 Subject: [PATCH 34/46] refactor: Rely On LocalBackend metrics in demo --- dev/yes-no-maybe-metrics.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py index 036e3847..cbb0c5fd 100644 --- a/dev/yes-no-maybe-metrics.py +++ b/dev/yes-no-maybe-metrics.py @@ -2,7 +2,8 @@ This keeps the same prompt family, rollout structure, and reward ordering as `dev/yes-no-maybe.py` while adding explicit metrics taxonomy instrumentation for -actor/eval timing and data metrics. +actor/eval timing and data metrics, while relying on LocalBackend for automatic +step wall time and GPU cost logging. """ from __future__ import annotations @@ -217,7 +218,6 @@ async def main() -> None: await model.log(val_groups, split="val", step=current_step) train_builder = model.metrics_builder("train") - step_started = time.monotonic() with train_builder.activate_context(): with train_builder.measure("time/step_actor_s"): train_groups = await art.gather_trajectory_groups( @@ -248,10 +248,7 @@ async def main() -> None: split="train", step=result.step, trajectories=train_groups, - metrics={ - **result.metrics, - "time/step_wall_s": time.monotonic() - step_started, - }, + metrics=result.metrics, ) print(f"step {result.step} complete") From 67ff7262588dcba82a2c53b8dfccdb8e03c082ad Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 10:34:23 -0700 Subject: [PATCH 35/46] fix: preserve out-of-order wandb metric logging --- src/art/model.py | 8 ++++---- tests/unit/test_metric_routing.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/art/model.py b/src/art/model.py index 62b4fcd8..b4d7bdef 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -485,10 +485,10 @@ def _log_metrics( if should_log_wandb: if run := self._get_wandb_run(): 
self._define_wandb_step_metrics(prefixed.keys()) - # Keep W&B's internal step aligned with ART's training_step so - # multiple log calls for the same training step do not inflate - # the run's step count. - run.log(prefixed, step=step) + # Let W&B use its own monotonically increasing history step. + # ART's `training_step` remains the x-axis via define_metric, + # which preserves out-of-order eval logging. + run.log(prefixed) def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None: import wandb diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py index 4d6316d0..f9904527 100644 --- a/tests/unit/test_metric_routing.py +++ b/tests/unit/test_metric_routing.py @@ -118,4 +118,4 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step( assert logged_metrics["costs/cum/train/prefill"] == 0.2 assert logged_metrics["training_step"] == 1 assert "time/wall_clock_sec" in logged_metrics - assert fake_run.log.call_args.kwargs == {"step": 1} + assert fake_run.log.call_args.kwargs == {} From fab790762ed5618d191eccd800b91e2e1e3aeb59 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 10:34:45 -0700 Subject: [PATCH 36/46] fix: account for cached API token pricing --- src/art/metrics.py | 24 ++ src/art/metrics_api_cost.py | 365 ++++++++++++++++++++++++++---- tests/unit/test_track_api_cost.py | 188 ++++++++++++++- 3 files changed, 527 insertions(+), 50 deletions(-) diff --git a/src/art/metrics.py b/src/art/metrics.py index 29946316..d2ba6358 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -99,6 +99,9 @@ def add_response_cost( model_name_getter: "ModelNameGetter | None" = None, prompt_price_per_million: float | None = None, completion_price_per_million: float | None = None, + cached_prompt_price_per_million: float | None = None, + cache_creation_price_per_million: float | None = None, + cache_read_price_per_million: float | None = None, ) -> float | None: normalized_source = source.strip("/") if not 
normalized_source: @@ -111,6 +114,9 @@ def add_response_cost( model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, cost_extractors=self._shared_state.cost_extractors, model_pricing=self._shared_state.model_pricing, ) @@ -248,6 +254,9 @@ def register_model_pricing( *, prompt_per_million: float, completion_per_million: float, + cached_prompt_per_million: float | None = None, + cache_creation_per_million: float | None = None, + cache_read_per_million: float | None = None, ) -> None: normalized_model_name = model_name.strip() if not normalized_model_name: @@ -255,6 +264,21 @@ def register_model_pricing( self._shared_state.model_pricing[normalized_model_name] = TokenPricing( prompt_per_million=float(prompt_per_million), completion_per_million=float(completion_per_million), + cached_prompt_per_million=( + float(cached_prompt_per_million) + if cached_prompt_per_million is not None + else None + ), + cache_creation_per_million=( + float(cache_creation_per_million) + if cache_creation_per_million is not None + else None + ), + cache_read_per_million=( + float(cache_read_per_million) + if cache_read_per_million is not None + else None + ), ) def state_dict(self) -> dict[str, Any]: diff --git a/src/art/metrics_api_cost.py b/src/art/metrics_api_cost.py index 6d713192..a98332f8 100644 --- a/src/art/metrics_api_cost.py +++ b/src/art/metrics_api_cost.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from functools import wraps from inspect import iscoroutinefunction +import re from typing import Any, ParamSpec, TypeVar from .costs import get_model_pricing, tokens_to_cost @@ -23,6 +24,53 @@ class TokenPricing: prompt_per_million: float completion_per_million: float + 
cached_prompt_per_million: float | None = None + cache_creation_per_million: float | None = None + cache_read_per_million: float | None = None + + +@dataclass(frozen=True) +class _OpenAITokenUsage: + prompt_tokens: float + completion_tokens: float + cached_prompt_tokens: float + + +@dataclass(frozen=True) +class _AnthropicTokenUsage: + input_tokens: float + output_tokens: float + cache_creation_input_tokens: float + cache_read_input_tokens: float + + +_DEFAULT_TOKEN_PRICING: dict[str, TokenPricing] = { + "openai/gpt-4.1": TokenPricing( + prompt_per_million=2.0, + completion_per_million=8.0, + cached_prompt_per_million=0.5, + ), + "anthropic/claude-sonnet-4-6": TokenPricing( + prompt_per_million=3.0, + completion_per_million=15.0, + cache_creation_per_million=3.75, + cache_read_per_million=0.30, + ), +} + + +def _default_token_pricing(model_name: str) -> TokenPricing | None: + explicit = _DEFAULT_TOKEN_PRICING.get(model_name) + if explicit is not None: + return explicit + + pricing = get_model_pricing(model_name) + if pricing is None: + return None + return TokenPricing( + prompt_per_million=pricing.prefill, + completion_per_million=pricing.sample, + ) def normalize_provider(provider: str | None) -> str | None: if provider is None: @@ -45,6 +93,20 @@ def _read_usage_field(usage: Any, field: str) -> float | None: return float(value) +def _read_usage_nested_field(usage: Any, *fields: str) -> float | None: + current = usage + for field in fields: + if current is None: + return None + if isinstance(current, dict): + current = current.get(field) + else: + current = getattr(current, field, None) + if current is None: + return None + return float(current) + + def _response_usage(response: Any) -> Any: if isinstance(response, dict): return response.get("usage") @@ -62,22 +124,50 @@ def _response_model_name(response: Any) -> str | None: return normalized or None -def _extract_openai_token_counts(response: Any) -> tuple[float, float] | None: +def 
_extract_openai_token_counts(response: Any) -> _OpenAITokenUsage | None: usage = _response_usage(response) prompt_tokens = _read_usage_field(usage, "prompt_tokens") completion_tokens = _read_usage_field(usage, "completion_tokens") - if prompt_tokens is None and completion_tokens is None: + cached_prompt_tokens = ( + _read_usage_nested_field(usage, "prompt_tokens_details", "cached_tokens") or 0.0 + ) + if ( + prompt_tokens is None + and completion_tokens is None + and cached_prompt_tokens == 0.0 + ): return None - return prompt_tokens or 0.0, completion_tokens or 0.0 + total_prompt_tokens = prompt_tokens or 0.0 + return _OpenAITokenUsage( + prompt_tokens=total_prompt_tokens, + completion_tokens=completion_tokens or 0.0, + cached_prompt_tokens=min(cached_prompt_tokens, total_prompt_tokens), + ) -def _extract_anthropic_token_counts(response: Any) -> tuple[float, float] | None: +def _extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | None: usage = _response_usage(response) input_tokens = _read_usage_field(usage, "input_tokens") output_tokens = _read_usage_field(usage, "output_tokens") - if input_tokens is None and output_tokens is None: + cache_creation_input_tokens = ( + _read_usage_field(usage, "cache_creation_input_tokens") or 0.0 + ) + cache_read_input_tokens = ( + _read_usage_field(usage, "cache_read_input_tokens") or 0.0 + ) + if ( + input_tokens is None + and output_tokens is None + and cache_creation_input_tokens == 0.0 + and cache_read_input_tokens == 0.0 + ): return None - return input_tokens or 0.0, output_tokens or 0.0 + return _AnthropicTokenUsage( + input_tokens=input_tokens or 0.0, + output_tokens=output_tokens or 0.0, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) def _detect_provider(response: Any) -> str | None: @@ -98,16 +188,145 @@ def _detect_provider(response: Any) -> str | None: return None -def _estimate_cost( - token_counts: tuple[float, float] | None, +def 
_estimate_openai_cost( + token_counts: _OpenAITokenUsage | None, pricing: TokenPricing, ) -> float | None: if token_counts is None: return None - prompt_tokens, completion_tokens = token_counts - return tokens_to_cost(prompt_tokens, pricing.prompt_per_million) + tokens_to_cost( - completion_tokens, - pricing.completion_per_million, + uncached_prompt_tokens = max( + token_counts.prompt_tokens - token_counts.cached_prompt_tokens, + 0.0, + ) + cached_prompt_price = ( + pricing.cached_prompt_per_million + if pricing.cached_prompt_per_million is not None + else pricing.prompt_per_million + ) + return ( + tokens_to_cost(uncached_prompt_tokens, pricing.prompt_per_million) + + tokens_to_cost( + token_counts.cached_prompt_tokens, + cached_prompt_price, + ) + + tokens_to_cost( + token_counts.completion_tokens, + pricing.completion_per_million, + ) + ) + + +def _estimate_anthropic_cost( + token_counts: _AnthropicTokenUsage | None, + pricing: TokenPricing, +) -> float | None: + if token_counts is None: + return None + cache_creation_price = ( + pricing.cache_creation_per_million + if pricing.cache_creation_per_million is not None + else pricing.prompt_per_million + ) + cache_read_price = ( + pricing.cache_read_per_million + if pricing.cache_read_per_million is not None + else pricing.prompt_per_million + ) + return ( + tokens_to_cost(token_counts.input_tokens, pricing.prompt_per_million) + + tokens_to_cost( + token_counts.cache_creation_input_tokens, + cache_creation_price, + ) + + tokens_to_cost( + token_counts.cache_read_input_tokens, + cache_read_price, + ) + + tokens_to_cost( + token_counts.output_tokens, + pricing.completion_per_million, + ) + ) + + +def _strip_snapshot_suffix(model_name: str) -> str: + for pattern in ( + r"^(.*)-\d{4}-\d{2}-\d{2}$", + r"^(.*)-\d{8}$", + ): + match = re.match(pattern, model_name) + if match is not None: + return match.group(1) + return model_name + + +def _candidate_model_names( + normalized_model_name: str, + *, + provider: str | None, 
+) -> list[str]: + candidates: list[str] = [] + + def _append(candidate: str | None) -> None: + if candidate and candidate not in candidates: + candidates.append(candidate) + + _append(normalized_model_name) + _append(_strip_snapshot_suffix(normalized_model_name)) + + if provider is not None and "/" not in normalized_model_name: + _append(f"{provider}/{normalized_model_name}") + _append(f"{provider}/{_strip_snapshot_suffix(normalized_model_name)}") + + return candidates + + +def _resolve_registered_or_default_pricing( + model_name: str, + *, + model_pricing: Mapping[str, TokenPricing], +) -> TokenPricing | None: + registered = model_pricing.get(model_name) + if registered is not None: + return registered + return _default_token_pricing(model_name) + + +def _merge_token_pricing( + *, + base_pricing: TokenPricing, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, +) -> TokenPricing: + return TokenPricing( + prompt_per_million=( + float(prompt_price_per_million) + if prompt_price_per_million is not None + else base_pricing.prompt_per_million + ), + completion_per_million=( + float(completion_price_per_million) + if completion_price_per_million is not None + else base_pricing.completion_per_million + ), + cached_prompt_per_million=( + float(cached_prompt_price_per_million) + if cached_prompt_price_per_million is not None + else base_pricing.cached_prompt_per_million + ), + cache_creation_per_million=( + float(cache_creation_price_per_million) + if cache_creation_price_per_million is not None + else base_pricing.cache_creation_per_million + ), + cache_read_per_million=( + float(cache_read_price_per_million) + if cache_read_price_per_million is not None + else base_pricing.cache_read_per_million + ), ) @@ -117,6 +336,7 @@ def _resolve_model_name( provider: str | None, model_name: str | None, 
model_name_getter: ModelNameGetter | None, + model_pricing: Mapping[str, TokenPricing], ) -> str | None: explicit_model_name = model_name.strip() if model_name is not None else None if explicit_model_name: @@ -134,11 +354,19 @@ def _resolve_model_name( return None normalized_provider = normalize_provider(provider) - if normalized_provider is not None and "/" not in normalized_model_name: - provider_scoped_name = f"{normalized_provider}/{normalized_model_name}" - if get_model_pricing(provider_scoped_name) is not None: - return provider_scoped_name + candidates = _candidate_model_names( + normalized_model_name, + provider=normalized_provider, + ) + for candidate in candidates: + if _resolve_registered_or_default_pricing( + candidate, + model_pricing=model_pricing, + ) is not None: + return candidate + if normalized_provider is not None and "/" not in normalized_model_name: + return f"{normalized_provider}/{normalized_model_name}" return normalized_model_name @@ -150,6 +378,9 @@ def _resolve_token_pricing( model_name_getter: ModelNameGetter | None, prompt_price_per_million: float | None, completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, model_pricing: Mapping[str, TokenPricing], ) -> TokenPricing: explicit_prompt_price = ( @@ -162,47 +393,61 @@ def _resolve_token_pricing( if completion_price_per_million is not None else None ) - if ( - explicit_prompt_price is not None - and explicit_completion_price is not None - ): - return TokenPricing( - prompt_per_million=explicit_prompt_price, - completion_per_million=explicit_completion_price, - ) + explicit_cached_prompt_price = ( + float(cached_prompt_price_per_million) + if cached_prompt_price_per_million is not None + else None + ) + explicit_cache_creation_price = ( + float(cache_creation_price_per_million) + if cache_creation_price_per_million is not None + else None + ) + 
explicit_cache_read_price = ( + float(cache_read_price_per_million) + if cache_read_price_per_million is not None + else None + ) resolved_model_name = _resolve_model_name( response, provider=provider, model_name=model_name, model_name_getter=model_name_getter, + model_pricing=model_pricing, ) if resolved_model_name is None: + if explicit_prompt_price is not None and explicit_completion_price is not None: + return TokenPricing( + prompt_per_million=explicit_prompt_price, + completion_per_million=explicit_completion_price, + cached_prompt_per_million=explicit_cached_prompt_price, + cache_creation_per_million=explicit_cache_creation_price, + cache_read_per_million=explicit_cache_read_price, + ) raise ValueError( "API cost tracking requires model-aware pricing. " "Provide both explicit token prices or supply a model_name " "(or response.model / model_name_getter) with configured pricing." ) - configured_pricing = model_pricing.get(resolved_model_name) + configured_pricing = _resolve_registered_or_default_pricing( + resolved_model_name, + model_pricing=model_pricing, + ) if configured_pricing is None: - pricing = get_model_pricing(resolved_model_name, strict=True) - configured_pricing = TokenPricing( - prompt_per_million=pricing.prefill, - completion_per_million=pricing.sample, + raise ValueError( + f"No pricing configured for model '{resolved_model_name}'. " + "Provide explicit token prices or register model pricing." 
) - return TokenPricing( - prompt_per_million=( - explicit_prompt_price - if explicit_prompt_price is not None - else configured_pricing.prompt_per_million - ), - completion_per_million=( - explicit_completion_price - if explicit_completion_price is not None - else configured_pricing.completion_per_million - ), + return _merge_token_pricing( + base_pricing=configured_pricing, + prompt_price_per_million=explicit_prompt_price, + completion_price_per_million=explicit_completion_price, + cached_prompt_price_per_million=explicit_cached_prompt_price, + cache_creation_price_per_million=explicit_cache_creation_price, + cache_read_price_per_million=explicit_cache_read_price, ) @@ -214,6 +459,9 @@ def extract_api_cost( model_name_getter: ModelNameGetter | None, prompt_price_per_million: float | None, completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, cost_extractors: Mapping[str, CostExtractor], model_pricing: Mapping[str, TokenPricing], ) -> float | None: @@ -232,12 +480,18 @@ def extract_api_cost( model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, model_pricing=model_pricing, ) if provider_name == OPENAI_PROVIDER: - return _estimate_cost(_extract_openai_token_counts(response), pricing) + return _estimate_openai_cost(_extract_openai_token_counts(response), pricing) if provider_name == ANTHROPIC_PROVIDER: - return _estimate_cost(_extract_anthropic_token_counts(response), pricing) + return _estimate_anthropic_cost( + _extract_anthropic_token_counts(response), + pricing, + ) pricing = _resolve_token_pricing( response, @@ -246,12 +500,16 @@ def 
extract_api_cost( model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, model_pricing=model_pricing, ) - token_counts = _extract_openai_token_counts(response) - if token_counts is None: - token_counts = _extract_anthropic_token_counts(response) - return _estimate_cost(token_counts, pricing) + openai_token_counts = _extract_openai_token_counts(response) + if openai_token_counts is not None: + return _estimate_openai_cost(openai_token_counts, pricing) + anthropic_token_counts = _extract_anthropic_token_counts(response) + return _estimate_anthropic_cost(anthropic_token_counts, pricing) def _record_api_cost( @@ -264,6 +522,9 @@ def _record_api_cost( model_name_getter: ModelNameGetter | None, prompt_price_per_million: float | None, completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, ) -> None: try: from .metrics import MetricsBuilder @@ -281,6 +542,9 @@ def _record_api_cost( model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, ) @@ -293,6 +557,9 @@ def track_api_cost( response_getter: ResponseGetter | None = None, prompt_price_per_million: float | None = None, completion_price_per_million: float | None = None, + cached_prompt_price_per_million: float | None = None, + cache_creation_price_per_million: float | None = None, + cache_read_price_per_million: float | None 
= None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: normalized_source = source.strip("/") if not normalized_source: @@ -315,6 +582,9 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, ) return result @@ -332,6 +602,9 @@ def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, ) return result diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index 5162595f..c5f951cc 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -6,14 +6,26 @@ import pytest from art import Model, TrainableModel, Trajectory, TrajectoryGroup +from art.costs import compute_sample_costs, get_model_pricing from art.metrics import MetricsBuilder, track_api_cost from art.pipeline_trainer.trainer import PipelineTrainer class _OpenAIUsage: - def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: + def __init__( + self, + prompt_tokens: int, + completion_tokens: int, + *, + cached_tokens: int = 0, + ) -> None: self.prompt_tokens = prompt_tokens self.completion_tokens = completion_tokens + self.prompt_tokens_details = type( + "PromptTokensDetails", + (), + {"cached_tokens": cached_tokens}, + )() class _OpenAIResponse: @@ -22,16 +34,30 @@ def __init__( prompt_tokens: int, completion_tokens: int, *, + cached_tokens: int = 0, model: 
str | None = None, ) -> None: - self.usage = _OpenAIUsage(prompt_tokens, completion_tokens) + self.usage = _OpenAIUsage( + prompt_tokens, + completion_tokens, + cached_tokens=cached_tokens, + ) self.model = model class _AnthropicUsage: - def __init__(self, input_tokens: int, output_tokens: int) -> None: + def __init__( + self, + input_tokens: int, + output_tokens: int, + *, + cache_creation_input_tokens: int = 0, + cache_read_input_tokens: int = 0, + ) -> None: self.input_tokens = input_tokens self.output_tokens = output_tokens + self.cache_creation_input_tokens = cache_creation_input_tokens + self.cache_read_input_tokens = cache_read_input_tokens class _AnthropicResponse: @@ -40,9 +66,16 @@ def __init__( input_tokens: int, output_tokens: int, *, + cache_creation_input_tokens: int = 0, + cache_read_input_tokens: int = 0, model: str | None = None, ) -> None: - self.usage = _AnthropicUsage(input_tokens, output_tokens) + self.usage = _AnthropicUsage( + input_tokens, + output_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) self.model = model @@ -69,6 +102,33 @@ async def _judge() -> _OpenAIResponse: metrics = await builder.flush() assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) + @pytest.mark.asyncio + async def test_openai_cost_extraction_accounts_for_cached_tokens(self) -> None: + builder = MetricsBuilder(cost_context="train") + + @track_api_cost( + source="llm_judge/cached_openai", + provider="openai", + prompt_price_per_million=2.0, + completion_price_per_million=8.0, + cached_prompt_price_per_million=0.5, + ) + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse( + prompt_tokens=2_000, + completion_tokens=100, + cached_tokens=1_500, + ) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + assert metrics["costs/train/llm_judge/cached_openai"] == pytest.approx(0.00255) + 
@pytest.mark.asyncio async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None: builder = MetricsBuilder(cost_context="train") @@ -94,6 +154,126 @@ async def _judge() -> _AnthropicResponse: metrics = await builder.flush() assert metrics["costs/train/llm_judge/faithfulness"] == pytest.approx(0.00062) + @pytest.mark.asyncio + async def test_anthropic_cost_extraction_accounts_for_cache_write_and_read( + self, + ) -> None: + builder = MetricsBuilder(cost_context="eval") + builder.register_model_pricing( + "anthropic/claude-sonnet-4-6", + prompt_per_million=3.0, + completion_per_million=15.0, + cache_creation_per_million=3.75, + cache_read_per_million=0.30, + ) + + @track_api_cost( + source="llm_judge/anthropic_cache", + provider="anthropic", + model_name="anthropic/claude-sonnet-4-6", + ) + async def _judge() -> _AnthropicResponse: + return _AnthropicResponse( + input_tokens=100, + output_tokens=50, + cache_creation_input_tokens=1_000, + cache_read_input_tokens=500, + ) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + assert metrics["costs/eval/llm_judge/anthropic_cache"] == pytest.approx(0.00495) + + @pytest.mark.asyncio + async def test_response_model_name_resolves_provider_scoped_global_pricing( + self, + ) -> None: + builder = MetricsBuilder(cost_context="train") + pricing = get_model_pricing("openai/gpt-oss-20b") + assert pricing is not None + + @track_api_cost(source="llm_judge/global_pricing", provider="openai") + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse( + prompt_tokens=1_000, + completion_tokens=2_000, + model="gpt-oss-20b", + ) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + expected = compute_sample_costs( + prompt_tokens=1_000, + completion_tokens=2_000, + pricing=pricing, + ) + assert metrics["costs/train/llm_judge/global_pricing"] == pytest.approx( 
+ expected["costs_prefill"] + expected["costs_sample"] + ) + + @pytest.mark.asyncio + async def test_response_model_name_resolves_provider_scoped_registered_pricing( + self, + ) -> None: + builder = MetricsBuilder(cost_context="eval") + builder.register_model_pricing( + "anthropic/test-judge", + prompt_per_million=1.5, + completion_per_million=2.5, + ) + + @track_api_cost(source="llm_judge/provider_resolution", provider="anthropic") + async def _judge() -> _AnthropicResponse: + return _AnthropicResponse( + input_tokens=400, + output_tokens=600, + model="test-judge", + ) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + assert metrics["costs/eval/llm_judge/provider_resolution"] == pytest.approx( + 0.0021 + ) + + @pytest.mark.asyncio + async def test_snapshot_model_name_resolves_to_global_pricing(self) -> None: + builder = MetricsBuilder(cost_context="train") + + @track_api_cost(source="llm_judge/snapshot", provider="openai") + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse( + prompt_tokens=1_000, + completion_tokens=100, + cached_tokens=800, + model="gpt-4.1-2025-04-14", + ) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + expected = ((200 * 2.0) + (800 * 0.5) + (100 * 8.0)) / 1_000_000 + assert metrics["costs/train/llm_judge/snapshot"] == pytest.approx(expected) + @pytest.mark.asyncio async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None: builder = MetricsBuilder(cost_context="train") From 9a48f2e39242b10ddfd60d6fd1b8f69c6832424d Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 10:34:58 -0700 Subject: [PATCH 37/46] test: add live API cost smoke tests --- pyproject.toml | 3 + tests/integration/test_live_api_cost.py | 224 ++++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 tests/integration/test_live_api_cost.py diff --git 
a/pyproject.toml b/pyproject.toml index af96ff76..2469677c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,6 +116,9 @@ force-sort-within-sections = true [tool.pytest.ini_options] asyncio_mode = "auto" +markers = [ + "live_api_cost: opt-in live API cost validation against provider endpoints", +] [tool.uv] required-version = ">=0.6.15" diff --git a/tests/integration/test_live_api_cost.py b/tests/integration/test_live_api_cost.py new file mode 100644 index 00000000..c2bd733c --- /dev/null +++ b/tests/integration/test_live_api_cost.py @@ -0,0 +1,224 @@ +import json +import os +from pathlib import Path +import urllib.request +from uuid import uuid4 + +import pytest + +from art import Model +from art.metrics import track_api_cost + +pytestmark = pytest.mark.live_api_cost + +_LIVE_ENV = "ART_RUN_LIVE_API_COST_TESTS" + + +def _require_live_test_env(*required_vars: str) -> None: + if os.environ.get(_LIVE_ENV) != "1": + pytest.skip(f"Set {_LIVE_ENV}=1 to run live API cost tests.") + missing = [name for name in required_vars if not os.environ.get(name)] + if missing: + pytest.skip(f"Missing required env vars: {', '.join(missing)}") + + +def _post_json(url: str, *, headers: dict[str, str], payload: dict) -> dict: + request = urllib.request.Request( + url, + data=json.dumps(payload).encode("utf-8"), + headers=headers, + method="POST", + ) + with urllib.request.urlopen(request, timeout=120) as response: + return json.loads(response.read().decode("utf-8")) + + +def _cacheable_prefix(word_count: int = 1500) -> str: + return " ".join(f"cache-token-{index % 16}" for index in range(word_count)) + + +def _history_rows(history_path: Path) -> list[dict]: + return [json.loads(line) for line in history_path.read_text().splitlines() if line] + + +def _openai_completion(*, api_key: str, prompt_cache_key: str, prefix: str) -> dict: + return _post_json( + "https://api.openai.com/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": 
"application/json", + }, + payload={ + "model": "gpt-4.1", + "messages": [ + {"role": "system", "content": prefix}, + {"role": "user", "content": "Reply with OK."}, + ], + "temperature": 0, + "max_completion_tokens": 4, + "prompt_cache_key": prompt_cache_key, + }, + ) + + +def _anthropic_message(*, api_key: str, prefix: str) -> dict: + return _post_json( + "https://api.anthropic.com/v1/messages", + headers={ + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + }, + payload={ + "model": "claude-sonnet-4-6", + "max_tokens": 8, + "temperature": 0, + "system": [ + { + "type": "text", + "text": prefix, + "cache_control": {"type": "ephemeral"}, + } + ], + "messages": [ + {"role": "user", "content": "Reply with OK."}, + ], + }, + ) + + +class TestLiveApiCost: + @pytest.mark.asyncio + async def test_openai_gpt_4_1_cached_prompt_cost(self, tmp_path: Path) -> None: + _require_live_test_env("OPENAI_API_KEY") + + api_key = os.environ["OPENAI_API_KEY"] + prefix = _cacheable_prefix() + prompt_cache_key = f"art-live-api-cost-{uuid4()}" + + # Warm the cache first so the tracked request can validate cached pricing. 
+ _openai_completion( + api_key=api_key, + prompt_cache_key=prompt_cache_key, + prefix=prefix, + ) + + model = Model( + name="live-openai-api-cost", + project="live-api-cost", + base_path=str(tmp_path), + report_metrics=[], + ) + + @track_api_cost( + source="llm_judge/openai_cached_prompt", + provider="openai", + model_name="openai/gpt-4.1", + ) + def _judge() -> dict: + return _openai_completion( + api_key=api_key, + prompt_cache_key=prompt_cache_key, + prefix=prefix, + ) + + token = model.activate_metrics_context("eval") + try: + response = _judge() + finally: + token.var.reset(token) + + await model.log(trajectories=None, split="val", step=1, metrics={}) + + usage = response["usage"] + cached_tokens = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0) + assert cached_tokens > 0 + + expected_cost = ( + ((usage["prompt_tokens"] - cached_tokens) * 2.0) + + (cached_tokens * 0.5) + + (usage["completion_tokens"] * 8.0) + ) / 1_000_000 + + history_path = ( + tmp_path + / "live-api-cost" + / "models" + / "live-openai-api-cost" + / "history.jsonl" + ) + row = _history_rows(history_path)[0] + assert row["costs/eval/llm_judge/openai_cached_prompt"] == pytest.approx( + expected_cost + ) + + @pytest.mark.asyncio + async def test_anthropic_claude_sonnet_4_6_prompt_cache_cost( + self, + tmp_path: Path, + ) -> None: + _require_live_test_env("ANTHROPIC_API_KEY") + + api_key = os.environ["ANTHROPIC_API_KEY"] + prefix = _cacheable_prefix() + + model = Model( + name="live-anthropic-api-cost", + project="live-api-cost", + base_path=str(tmp_path), + report_metrics=[], + ) + + @track_api_cost( + source="llm_judge/anthropic_prompt_cache", + provider="anthropic", + model_name="anthropic/claude-sonnet-4-6", + ) + def _judge() -> dict: + return _anthropic_message(api_key=api_key, prefix=prefix) + + token = model.activate_metrics_context("eval") + try: + first_response = _judge() + finally: + token.var.reset(token) + await model.log(trajectories=None, split="val", step=1, 
metrics={}) + + token = model.activate_metrics_context("eval") + try: + second_response = _judge() + finally: + token.var.reset(token) + await model.log(trajectories=None, split="val", step=2, metrics={}) + + first_usage = first_response["usage"] + second_usage = second_response["usage"] + assert first_usage.get("cache_creation_input_tokens", 0) > 0 + assert second_usage.get("cache_read_input_tokens", 0) > 0 + + first_expected_cost = ( + (first_usage["input_tokens"] * 3.0) + + (first_usage.get("cache_creation_input_tokens", 0) * 3.75) + + (first_usage["output_tokens"] * 15.0) + ) / 1_000_000 + second_expected_cost = ( + (second_usage["input_tokens"] * 3.0) + + (second_usage.get("cache_read_input_tokens", 0) * 0.30) + + (second_usage["output_tokens"] * 15.0) + ) / 1_000_000 + + history_path = ( + tmp_path + / "live-api-cost" + / "models" + / "live-anthropic-api-cost" + / "history.jsonl" + ) + first_row, second_row = _history_rows(history_path) + + assert first_row["costs/eval/llm_judge/anthropic_prompt_cache"] == pytest.approx( + first_expected_cost + ) + assert second_row[ + "costs/eval/llm_judge/anthropic_prompt_cache" + ] == pytest.approx(second_expected_cost) From 84328cef802c633cd72d0fc61ce7797fccc96796 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 10:51:48 -0700 Subject: [PATCH 38/46] refactor: Rename API cost module --- src/art/{metrics_api_cost.py => api_costs.py} | 0 src/art/metrics.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename src/art/{metrics_api_cost.py => api_costs.py} (100%) diff --git a/src/art/metrics_api_cost.py b/src/art/api_costs.py similarity index 100% rename from src/art/metrics_api_cost.py rename to src/art/api_costs.py diff --git a/src/art/metrics.py b/src/art/metrics.py index d2ba6358..4a064fda 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -7,7 +7,7 @@ import time from typing import Any -from .metrics_api_cost import ( +from .api_costs import ( CostExtractor, ModelNameGetter, 
TokenPricing, @@ -403,4 +403,4 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds -from .metrics_api_cost import track_api_cost +from .api_costs import track_api_cost From 7c0a86f810df856e52206f877f139fde5d0aa53b Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 11:05:22 -0700 Subject: [PATCH 39/46] docs: Remove metrics taxonomy smoke example --- docs/metrics-taxonomy.md | 20 +---- examples/metrics_taxonomy_smoke.py | 120 ----------------------------- 2 files changed, 2 insertions(+), 138 deletions(-) delete mode 100644 examples/metrics_taxonomy_smoke.py diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index bb9ee871..9c7c7b54 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -91,16 +91,6 @@ builder.add_idle_times(step_actor_idle_s=result.actor_idle_s) If these metrics are logged before the next `model.log(...)` flush, ART will also emit the cumulative and derived throughput metrics automatically. -## End-to-End Smoke Test - -Run: - -```bash -uv run python examples/metrics_taxonomy_smoke.py -``` - -This writes a local history file and, if `WANDB_API_KEY` is set, logs the same metrics to W&B. - ## API Cost Decorator (Phase 2/3) Use `@track_api_cost` to automatically write judge/API spend into `costs/{train|eval}/...`. 
@@ -123,17 +113,11 @@ async def run_judge(client, messages): Activate metric cost context while running train/eval logic: ```python -train_token = model.activate_metrics_context("train") -try: +with model.metrics_builder("train").activate_context(): await run_judge(client, train_messages) -finally: - train_token.var.reset(train_token) -eval_token = model.activate_metrics_context("eval") -try: +with model.metrics_builder("eval").activate_context(): await run_judge(client, eval_messages) -finally: - eval_token.var.reset(eval_token) ``` The next `model.log(...)` flush for that step will include: diff --git a/examples/metrics_taxonomy_smoke.py b/examples/metrics_taxonomy_smoke.py deleted file mode 100644 index ff4d4afe..00000000 --- a/examples/metrics_taxonomy_smoke.py +++ /dev/null @@ -1,120 +0,0 @@ -import asyncio -import json -import os -from pathlib import Path -import time - -import art -from art.metrics import track_api_cost - - -class _Usage: - def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: - self.prompt_tokens = prompt_tokens - self.completion_tokens = completion_tokens - - -class _Response: - def __init__(self, prompt_tokens: int, completion_tokens: int) -> None: - self.usage = _Usage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - ) - - -@track_api_cost( - source="llm_judge/decorator_demo", - provider="openai", - model_name="openai/gpt-oss-20b", -) -async def _mock_judge_call(step: int) -> _Response: - return _Response( - prompt_tokens=50 * step, - completion_tokens=20 * step, - ) - - -async def main() -> None: - project_spec = os.environ.get("ART_METRICS_PROJECT", "metrics-taxonomy-smoke") - entity = os.environ.get("ART_METRICS_ENTITY") - project = project_spec - if entity is None and "/" in project_spec: - split_entity, split_project = project_spec.split("/", 1) - if split_entity and split_project: - entity = split_entity - project = split_project - - model_name = os.environ.get( - "ART_METRICS_MODEL", 
f"metrics-smoke-{int(time.time())}" - ) - base_path = os.environ.get("ART_METRICS_BASE_PATH", ".art") - - model = art.Model( - name=model_name, - project=project, - entity=entity, - base_path=base_path, - report_metrics=["wandb"], - ) - - for step in (1, 2): - train_token = model.activate_metrics_context("train") - try: - await _mock_judge_call(step) - finally: - train_token.var.reset(train_token) - - trajectories = [ - art.TrajectoryGroup( - trajectories=[ - art.Trajectory( - reward=0.4 + 0.1 * step, - metrics={ - "judge_quality": 0.7 + 0.05 * step, - "reward/custom_prefixed": 0.2 * step, - }, - messages_and_choices=[ - {"role": "user", "content": f"smoke step {step}"}, - {"role": "assistant", "content": "ok"}, - ], - ) - ], - exceptions=[], - ) - ] - - await model.log( - trajectories, - split="train", - step=step, - metrics={ - "loss/train": 1.0 / step, - "loss/grad_norm": 0.5 + 0.1 * step, - "throughput/train_tok_per_sec": 1000.0 + 100.0 * step, - "time/step_wall_s": 1.5 + 0.2 * step, - "data/step_num_scenarios": 2.0, - "data/step_actor_tokens": 120.0 + 10.0 * step, - "costs_prefill": 0.10 * step, - "costs_sample": 0.05 * step, - "costs/train/llm_judge/correctness": 0.02 * step, - }, - ) - - history_path = Path(base_path) / project / "models" / model_name / "history.jsonl" - print(f"Wrote history: {history_path}") - - with open(history_path) as f: - rows = [json.loads(line) for line in f] - - print("\nLast row key excerpts:") - last = rows[-1] - show_prefixes = ("reward/", "loss/", "throughput/", "time/", "data/", "costs/") - for key in sorted(last): - if key.startswith(show_prefixes): - print(f"{key}: {last[key]}") - - print("\nIf WANDB_API_KEY is set, metrics are also logged to W&B.") - - -if __name__ == "__main__": - asyncio.run(main()) From 57644a01861294fceefd776de872dfd820c51215 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 11:05:32 -0700 Subject: [PATCH 40/46] refactor: Simplify metrics cost helpers --- src/art/api_costs.py | 52 
++++++++++++++++++++++---------------------- src/art/metrics.py | 13 ----------- 2 files changed, 26 insertions(+), 39 deletions(-) diff --git a/src/art/api_costs.py b/src/art/api_costs.py index a98332f8..957ab38b 100644 --- a/src/art/api_costs.py +++ b/src/art/api_costs.py @@ -249,6 +249,21 @@ def _estimate_anthropic_cost( ) +def _estimate_provider_cost( + provider_name: str | None, + response: Any, + pricing: TokenPricing, +) -> float | None: + if provider_name == OPENAI_PROVIDER: + return _estimate_openai_cost(_extract_openai_token_counts(response), pricing) + if provider_name == ANTHROPIC_PROVIDER: + return _estimate_anthropic_cost( + _extract_anthropic_token_counts(response), + pricing, + ) + return None + + def _strip_snapshot_suffix(model_name: str) -> str: for pattern in ( r"^(.*)-\d{4}-\d{2}-\d{2}$", @@ -466,32 +481,13 @@ def extract_api_cost( model_pricing: Mapping[str, TokenPricing], ) -> float | None: provider_name = normalize_provider(provider) or _detect_provider(response) - if provider_name is not None: - custom_extractor = cost_extractors.get(provider_name) - if custom_extractor is not None: - custom_cost = custom_extractor(response) - if custom_cost is not None: - return float(custom_cost) - - pricing = _resolve_token_pricing( - response, - provider=provider_name, - model_name=model_name, - model_name_getter=model_name_getter, - prompt_price_per_million=prompt_price_per_million, - completion_price_per_million=completion_price_per_million, - cached_prompt_price_per_million=cached_prompt_price_per_million, - cache_creation_price_per_million=cache_creation_price_per_million, - cache_read_price_per_million=cache_read_price_per_million, - model_pricing=model_pricing, - ) - if provider_name == OPENAI_PROVIDER: - return _estimate_openai_cost(_extract_openai_token_counts(response), pricing) - if provider_name == ANTHROPIC_PROVIDER: - return _estimate_anthropic_cost( - _extract_anthropic_token_counts(response), - pricing, - ) + custom_extractor = ( + 
cost_extractors.get(provider_name) if provider_name is not None else None + ) + if custom_extractor is not None: + custom_cost = custom_extractor(response) + if custom_cost is not None: + return float(custom_cost) pricing = _resolve_token_pricing( response, @@ -505,6 +501,10 @@ def extract_api_cost( cache_read_price_per_million=cache_read_price_per_million, model_pricing=model_pricing, ) + provider_cost = _estimate_provider_cost(provider_name, response, pricing) + if provider_cost is not None: + return provider_cost + openai_token_counts = _extract_openai_token_counts(response) if openai_token_counts is not None: return _estimate_openai_cost(openai_token_counts, pricing) diff --git a/src/art/metrics.py b/src/art/metrics.py index 4a064fda..7be67e08 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -182,8 +182,6 @@ def measure(self, key: str): async def flush(self) -> dict[str, float]: async with self._shared_state.lock: - self._validate_hierarchy() - result = dict(self._shared_state.step_buffer) cost_metrics = { key: value @@ -321,17 +319,6 @@ def _validate_and_add(self, key: str, value: float) -> None: self._shared_state.step_buffer.get(key, 0.0) + value ) - def _validate_hierarchy(self) -> None: - keys = sorted( - k for k in self._shared_state.step_buffer if k.startswith("costs/") - ) - for i, key in enumerate(keys): - for other in keys[i + 1 :]: - if other.startswith(f"{key}/"): - raise ValueError( - f"Leaf/parent conflict: '{key}' and '{other}' cannot coexist." 
- ) - def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]: if not cost_metrics: return {} From 401547bb693ed2b6f8544c086c86e3eee95e1596 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 11:47:47 -0700 Subject: [PATCH 41/46] refactor: Simplify metric taxonomy key handling --- docs/metrics-taxonomy.md | 46 ++++++++-------- src/art/local/backend.py | 2 - src/art/metrics_taxonomy.py | 52 ++----------------- .../binary_prefix_tool_pipeline.py | 2 +- src/art/pipeline_trainer/trainer.py | 3 ++ src/art/serverless/backend.py | 41 +++++++++++++-- src/art/tinker_native/backend.py | 33 ++++++++++-- src/art/unsloth/train.py | 40 ++++++++++++-- tests/unit/test_frontend_logging.py | 4 +- tests/unit/test_metrics_taxonomy.py | 29 +++++++++++ 10 files changed, 167 insertions(+), 85 deletions(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index 9c7c7b54..1da0b08f 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -12,28 +12,28 @@ Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups. 
- `data/*` - `train/*`, `val/*`, `test/*` -## Train Key Mapping - -Current training code emits the following canonical keys: - -- `reward` -> `reward/mean` -- `reward_std_dev` -> `reward/std_dev` -- `exception_rate` -> `reward/exception_rate` -- `group_metric_` -> `reward/group_` -- `policy_loss` / `loss` -> `loss/train` -- `entropy` -> `loss/entropy` -- `kl_div` -> `loss/kl_div` -- `kl_policy_ref` -> `loss/kl_policy_ref` -- `grad_norm` -> `loss/grad_norm` -- `learning_rate` -> `loss/learning_rate` -- `tokens_per_second` -> `throughput/train_tok_per_sec` -- `num_groups_submitted` -> `train/num_groups_submitted` -- `num_groups_trainable` -> `train/num_groups_trainable` -- `num_trajectories` -> `train/num_trajectories` -- `num_trainable_tokens` -> `train/num_trainable_tokens` -- `train_tokens` -> `data/step_trainer_tokens` -- `num_datums` -> `data/step_num_datums` -- `num_gradient_steps` -> `data/step_num_gradient_steps` +## Backend Output + +ART backends emit canonical sectioned keys directly. 
The canonical training keys include: + +- `reward/mean` +- `reward/std_dev` +- `reward/exception_rate` +- `reward/group_` +- `loss/train` +- `loss/entropy` +- `loss/kl_div` +- `loss/kl_policy_ref` +- `loss/grad_norm` +- `loss/learning_rate` +- `throughput/train_tok_per_sec` +- `train/num_groups_submitted` +- `train/num_groups_trainable` +- `train/num_trajectories` +- `train/num_trainable_tokens` +- `data/step_trainer_tokens` +- `data/step_num_datums` +- `data/step_num_gradient_steps` ## Cost Rollups @@ -59,7 +59,7 @@ ART now emits the following metrics from library internals where the data is ava - `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer` - `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted` - `data/step_num_groups_trainable` for train splits -- `data/cum/num_unique_scenarios` when scenario IDs are present in group or trajectory metadata +- `data/cum/num_unique_scenarios` when `scenario_id` is present in group or trajectory metadata - `data/step_trainer_tokens` where the backend knows the trainer token count - `costs/gpu` on `LocalBackend` train-step logs when ART can resolve GPU pricing - `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s` diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 807a0f42..d19aba32 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -48,7 +48,6 @@ TRAIN_GRADIENT_STEPS_KEY, average_metric_samples, build_training_summary_metrics, - rename_train_metrics, summarize_trajectory_groups, ) from ..model import Model, TrainableModel @@ -789,7 +788,6 @@ async def _train_model( async for result in service.train( disk_packed_tensors, config, dev_config, verbose ): - result = rename_train_metrics(result) num_gradient_steps = int( result.pop(TRAIN_GRADIENT_STEPS_KEY, estimated_gradient_steps) ) diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py index e4f9e713..6d8adcc4 100644 --- a/src/art/metrics_taxonomy.py +++ 
b/src/art/metrics_taxonomy.py @@ -6,46 +6,9 @@ from .trajectories import TrajectoryGroup TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps" - -_SCENARIO_ID_CANDIDATE_KEYS = ( - "scenario_id", - "scenario_scenario_id", - "scenario_idx", - "scenario_scenario_idx", -) - -TRAIN_METRIC_KEY_RENAMES = { - "reward": "reward/mean", - "reward_std_dev": "reward/std_dev", - "exception_rate": "reward/exception_rate", - "policy_loss": "loss/train", - "loss": "loss/train", - "entropy": "loss/entropy", - "kl_div": "loss/kl_div", - "kl_policy_ref": "loss/kl_policy_ref", - "grad_norm": "loss/grad_norm", - "learning_rate": "loss/learning_rate", - "tokens_per_second": "throughput/train_tok_per_sec", - "num_groups_submitted": "train/num_groups_submitted", - "num_groups_trainable": "train/num_groups_trainable", - "num_trajectories": "train/num_trajectories", - "num_trainable_tokens": "train/num_trainable_tokens", - "train_tokens": "data/step_trainer_tokens", - "num_datums": "data/step_num_datums", -} _INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY}) -def rename_train_metric_key(metric: str) -> str: - if metric.startswith("group_metric_"): - return f"reward/group_{metric[len('group_metric_'):]}" - return TRAIN_METRIC_KEY_RENAMES.get(metric, metric) - - -def rename_train_metrics(metrics: dict[str, float]) -> dict[str, float]: - return {rename_train_metric_key(key): float(value) for key, value in metrics.items()} - - def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]: totals: dict[str, float] = {} counts: dict[str, int] = {} @@ -164,14 +127,7 @@ def _extract_scenario_id(group: TrajectoryGroup) -> str | None: def _extract_scenario_id_from_metadata( metadata: dict[str, Any], ) -> str | None: - for key in _SCENARIO_ID_CANDIDATE_KEYS: - value = metadata.get(key) - if value is not None: - return str(value) - - for key, value in metadata.items(): - if value is None: - continue - if key.endswith("scenario_id") or 
key.endswith("scenario_idx"): - return str(value) - return None + scenario_id = metadata.get("scenario_id") + if scenario_id is None: + return None + return str(scenario_id) diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py index bc2f5a04..66ed32f7 100644 --- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py +++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py @@ -312,7 +312,7 @@ def build_scenario() -> Scenario: async def scenario_iter(): for i in range(scenario_count): scenario = build_scenario() - scenario["metadata"] = {"scenario_idx": i} + scenario["metadata"] = {"scenario_id": str(i)} yield scenario config = PipelineConfig( diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py index a32ad1b8..5d569277 100644 --- a/src/art/pipeline_trainer/trainer.py +++ b/src/art/pipeline_trainer/trainer.py @@ -678,6 +678,9 @@ def _apply_scenario_metadata( continue if not self._is_scalar_metadata(value): continue + if key == "scenario_id": + group.metadata["scenario_id"] = value + continue group.metadata[f"scenario_{key}"] = value def _is_group_stale(self, group: TrajectoryGroup, min_version: int) -> bool: diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index d0589f7f..3c8c186e 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -14,7 +14,6 @@ TRAIN_GRADIENT_STEPS_KEY, average_metric_samples, build_training_summary_metrics, - rename_train_metrics, summarize_trajectory_groups, ) from ..trajectories import Trajectory, TrajectoryGroup @@ -38,6 +37,42 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None: return None +_UPSTREAM_TRAIN_METRIC_KEYS = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + 
"kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "tokens_per_second": "throughput/train_tok_per_sec", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def _canonicalize_upstream_metric_key(metric: str) -> str: + if "/" in metric: + return metric + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_'):]}" + return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) + + +def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]: + return { + _canonicalize_upstream_metric_key(key): float(value) + for key, value in metrics.items() + } + + class ServerlessBackend(Backend): def __init__( self, *, api_key: str | None = None, base_url: str | None = None @@ -328,7 +363,7 @@ async def _train_model( assert pbar is not None and num_sequences is not None pbar.update(1) pbar.set_postfix(event.data) - metrics = rename_train_metrics( + metrics = _canonicalize_upstream_metrics( {k: float(v) for k, v in event.data.items()} ) yield { @@ -500,7 +535,7 @@ async def _train_sft( assert pbar is not None and num_batches is not None pbar.update(1) pbar.set_postfix(event.data) - metrics = rename_train_metrics( + metrics = _canonicalize_upstream_metrics( {k: float(v) for k, v in event.data.items()} ) yield { diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index aeb41e1c..f8c09b25 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -32,7 +32,6 @@ from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing from ..metrics_taxonomy import ( build_training_summary_metrics, - rename_train_metric_key, summarize_trajectory_groups, ) 
from ..model import Model, TrainableModel @@ -52,6 +51,34 @@ STATE_KEY_LATEST_STEP = "latest_step" T = TypeVar("T") +_UPSTREAM_TRAIN_METRIC_KEYS = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "tokens_per_second": "throughput/train_tok_per_sec", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def _canonicalize_upstream_metric_key(metric: str) -> str: + if "/" in metric: + return metric + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_'):]}" + return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) + @dataclass class ModelState: @@ -280,12 +307,12 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum: for key, value in forward_output.metrics.items(): if value is None: continue - metrics[rename_train_metric_key(key)] = float(value) + metrics[_canonicalize_upstream_metric_key(key)] = float(value) if optim_output.metrics: for key, value in optim_output.metrics.items(): if value is None: continue - metrics[rename_train_metric_key(key)] = float(value) + metrics[_canonicalize_upstream_metric_key(key)] = float(value) next_step = state.current_step + 1 checkpoint_name = f"step_{next_step:06d}" diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index 4b8d15d7..4505215a 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -12,7 +12,6 @@ from .. 
import dev from ..loss import loss_fn, shift_tensor -from ..metrics_taxonomy import rename_train_metric_key, rename_train_metrics from ..types import TrainConfig if TYPE_CHECKING: @@ -20,6 +19,41 @@ nest_asyncio.apply() +_UPSTREAM_TRAIN_METRIC_KEYS = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "tokens_per_second": "throughput/train_tok_per_sec", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def _canonicalize_upstream_metric_key(metric: str) -> str: + if "/" in metric: + return metric + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_'):]}" + return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) + + +def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]: + return { + _canonicalize_upstream_metric_key(key): float(value) + for key, value in metrics.items() + } + async def train( trainer: "GRPOTrainer", @@ -198,12 +232,12 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None: if next(iter(logs.keys())).startswith("eval_"): normalized_metrics = {f"val/{key}": val for key, val in metrics.items()} normalized_logs = { - f"val/{rename_train_metric_key(key[len('eval_') :])}": val + f"val/{_canonicalize_upstream_metric_key(key[len('eval_') :])}": val for key, val in logs.items() } results_queue.put_nowait({**normalized_metrics, **normalized_logs}) else: - results_queue.put_nowait({**rename_train_metrics(logs), **metrics}) + 
results_queue.put_nowait({**_canonicalize_upstream_metrics(logs), **metrics}) trainer._metrics["train"].clear() return log diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index c0adcb40..c4b2bf9f 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -589,7 +589,7 @@ async def test_train_logs_add_default_data_metrics_from_trajectory_groups( messages_and_choices=[{"role": "user", "content": "b"}], ), ], - metadata={"scenario_scenario_id": "scenario-1"}, + metadata={"scenario_id": "scenario-1"}, ), TrajectoryGroup( trajectories=[ @@ -599,7 +599,7 @@ async def test_train_logs_add_default_data_metrics_from_trajectory_groups( ) ], exceptions=[], - metadata={"scenario_scenario_id": "scenario-2"}, + metadata={"scenario_id": "scenario-2"}, ), ] diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py index 7d22ddf2..b2eaadc0 100644 --- a/tests/unit/test_metrics_taxonomy.py +++ b/tests/unit/test_metrics_taxonomy.py @@ -1,10 +1,12 @@ import pytest +from art import Trajectory, TrajectoryGroup from art.metrics_taxonomy import ( TRAIN_GRADIENT_STEPS_KEY, TrajectoryBatchSummary, average_metric_samples, build_training_summary_metrics, + summarize_trajectory_groups, ) @@ -49,3 +51,30 @@ def test_average_metric_samples_requires_invariant_gradient_step_count() -> None {TRAIN_GRADIENT_STEPS_KEY: 3.0}, ] ) + + +def test_summarize_trajectory_groups_only_counts_explicit_scenario_id() -> None: + summary = summarize_trajectory_groups( + [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=1.0, + messages_and_choices=[{"role": "user", "content": "a"}], + ) + ], + metadata={"scenario_id": "scenario-1"}, + ), + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.0, + messages_and_choices=[{"role": "user", "content": "b"}], + ) + ], + metadata={"scenario_scenario_id": "legacy-scenario"}, + ), + ] + ) + + assert summary.scenario_ids == ["scenario-1"] From 
92384a3f4cd611fa038724b2694a5992e747500b Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 12:43:17 -0700 Subject: [PATCH 42/46] refactor: Canonicalize cost and throughput keys --- docs/metrics-taxonomy.md | 4 +-- src/art/costs.py | 15 ++++++++--- src/art/model.py | 25 +++++++------------ .../binary_prefix_tool_pipeline.py | 9 ++++--- src/art/serverless/backend.py | 6 +++-- src/art/tinker_native/backend.py | 11 +++++--- src/art/unsloth/service.py | 1 - src/art/unsloth/train.py | 6 +++-- tests/unit/test_frontend_logging.py | 10 ++++---- tests/unit/test_track_api_cost.py | 3 ++- 10 files changed, 50 insertions(+), 40 deletions(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index 1da0b08f..72603c68 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -26,7 +26,6 @@ ART backends emit canonical sectioned keys directly. The canonical training keys - `loss/kl_policy_ref` - `loss/grad_norm` - `loss/learning_rate` -- `throughput/train_tok_per_sec` - `train/num_groups_submitted` - `train/num_groups_trainable` - `train/num_trajectories` @@ -37,10 +36,9 @@ ART backends emit canonical sectioned keys directly. The canonical training keys ## Cost Rollups -Cost leaves can be logged with either: +Cost leaves can be logged with hierarchical keys, for example: - hierarchical keys, e.g. `costs/train/llm_judge/correctness` -- legacy component keys, e.g. `costs_prefill`, `costs_sample` ART rolls costs up automatically: diff --git a/src/art/costs.py b/src/art/costs.py index 5ee5523a..e3e2b2b4 100644 --- a/src/art/costs.py +++ b/src/art/costs.py @@ -16,7 +16,7 @@ class ModelPricing: TokenCount: TypeAlias = int | None -CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount], dict[str, float]] +CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount, str], dict[str, float]] # Pricing per model ($/1M tokens). Keep in sync with infra pricing. 
MODEL_PRICING: dict[str, ModelPricing] = { @@ -88,16 +88,20 @@ def compute_sample_costs( *, prompt_tokens: int | None, completion_tokens: int | None, + cost_context: str, pricing: ModelPricing, ) -> dict[str, float]: """Compute prompt+completion costs for a single API call.""" + normalized_context = cost_context.strip("/") + if not normalized_context: + raise ValueError("cost_context must be non-empty") prompt_value = float(prompt_tokens or 0) completion_value = float(completion_tokens or 0) prefill_cost = tokens_to_cost(prompt_value, pricing.prefill) sample_cost = tokens_to_cost(completion_value, pricing.sample) return { - "costs_prefill": prefill_cost, - "costs_sample": sample_cost, + f"costs/{normalized_context}/prefill": prefill_cost, + f"costs/{normalized_context}/sample": sample_cost, } @@ -105,11 +109,14 @@ def build_cost_calculator(pricing: ModelPricing) -> CostCalculator: """Return a callable that computes prompt+completion costs for a request.""" def _calculator( - prompt_tokens: int | None, completion_tokens: int | None + prompt_tokens: int | None, + completion_tokens: int | None, + cost_context: str, ) -> dict[str, float]: return compute_sample_costs( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, + cost_context=cost_context, pricing=pricing, ) diff --git a/src/art/model.py b/src/art/model.py index b4d7bdef..13f8ed1a 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -36,8 +36,6 @@ ModelConfig = TypeVar("ModelConfig", bound=BaseModel | None) StateType = TypeVar("StateType", bound=dict[str, Any], default=dict[str, Any]) -COSTS_METRIC_PREFIX = "costs_" -COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" METRICS_BUILDER_STATE_KEY = "_metrics_builder_state" METRIC_SECTIONS = frozenset( { @@ -505,24 +503,17 @@ def _route_metrics_and_collect_non_costs( self, metrics: dict[str, float], split: str ) -> dict[str, float]: non_cost_metrics: dict[str, float] = {} - cost_context = "train" if split == "train" else "eval" for metric, value in 
metrics.items(): numeric_value = float(value) - if metric == COSTS_TOTAL_KEY: - raise ValueError( - "Do not log 'costs_total' directly. Log costs_* components " - "(e.g., costs_prefill, costs_sample) and totals are derived." - ) if metric.startswith("costs/"): self._metrics_builder.add_cost(metric[len("costs/") :], numeric_value) continue - if metric.startswith(COSTS_METRIC_PREFIX): - component = metric[len(COSTS_METRIC_PREFIX) :] - if component: - self._metrics_builder.add_cost( - f"{cost_context}/{component}", numeric_value - ) - continue + if metric.startswith("costs_"): + raise ValueError( + "Legacy cost keys like 'costs_prefill' are no longer supported. " + "Log hierarchical costs like 'costs/train/prefill' or " + "'costs/eval/prefill' instead." + ) if is_builder_managed_metric(metric): self._metrics_builder.add_metric(metric, numeric_value) continue @@ -878,7 +869,9 @@ def set_cost_calculator(self, calculator: CostCalculator | None) -> None: @staticmethod def _noop_cost_calculator( - _prompt_tokens: int | None, _completion_tokens: int | None + _prompt_tokens: int | None, + _completion_tokens: int | None, + _cost_context: str, ) -> dict[str, float]: return {} diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py index 66ed32f7..f9593c24 100644 --- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py +++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py @@ -229,7 +229,9 @@ async def main() -> None: openai_client = model.openai_client() cost_calculator = model.cost_calculator - async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory: + async def do_rollout( + scenario: Scenario, temp: float, cost_context: str + ) -> art.Trajectory: """Core rollout logic used by both training and eval.""" messages: art.Messages = scenario["messages"] response = await openai_client.chat.completions.create( @@ -265,6 +267,7 @@ async def do_rollout(scenario: Scenario, temp: float) 
-> art.Trajectory: sample_costs = cost_calculator( prompt_tokens, completion_tokens, + cost_context, ) if sample_costs: metrics.update(sample_costs) @@ -281,7 +284,7 @@ async def single_rollout( scenario: Scenario, _config: PipelineConfig, ) -> art.Trajectory: - return await do_rollout(scenario, temperature) + return await do_rollout(scenario, temperature, "train") rollout_fn = make_group_rollout_fn(single_rollout, n=rollouts_per_scenario) @@ -290,7 +293,7 @@ async def single_rollout( async def eval_fn( _model: art.TrainableModel, _step: int, _config: PipelineConfig ) -> list[art.Trajectory]: - tasks = [do_rollout(build_scenario(), eval_temperature)] + tasks = [do_rollout(build_scenario(), eval_temperature, "eval")] results = await asyncio.gather(*tasks, return_exceptions=True) trajectories = [r for r in results if isinstance(r, art.Trajectory)] if trajectories: diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index 3c8c186e..a28c0127 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -48,7 +48,6 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None: "kl_policy_ref": "loss/kl_policy_ref", "grad_norm": "loss/grad_norm", "learning_rate": "loss/learning_rate", - "tokens_per_second": "throughput/train_tok_per_sec", "num_groups_submitted": "train/num_groups_submitted", "num_groups_trainable": "train/num_groups_trainable", "num_trajectories": "train/num_trajectories", @@ -61,6 +60,8 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None: def _canonicalize_upstream_metric_key(metric: str) -> str: if "/" in metric: return metric + if metric == "tokens_per_second": + return "" if metric.startswith("group_metric_"): return f"reward/group_{metric[len('group_metric_'):]}" return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) @@ -68,8 +69,9 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> 
dict[str, float]: return { - _canonicalize_upstream_metric_key(key): float(value) + canonical_key: float(value) for key, value in metrics.items() + if (canonical_key := _canonicalize_upstream_metric_key(key)) } diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index f8c09b25..9d234944 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -62,7 +62,6 @@ "kl_policy_ref": "loss/kl_policy_ref", "grad_norm": "loss/grad_norm", "learning_rate": "loss/learning_rate", - "tokens_per_second": "throughput/train_tok_per_sec", "num_groups_submitted": "train/num_groups_submitted", "num_groups_trainable": "train/num_groups_trainable", "num_trajectories": "train/num_trajectories", @@ -75,6 +74,8 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: if "/" in metric: return metric + if metric == "tokens_per_second": + return "" if metric.startswith("group_metric_"): return f"reward/group_{metric[len('group_metric_'):]}" return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) @@ -307,12 +308,16 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum: for key, value in forward_output.metrics.items(): if value is None: continue - metrics[_canonicalize_upstream_metric_key(key)] = float(value) + canonical_key = _canonicalize_upstream_metric_key(key) + if canonical_key: + metrics[canonical_key] = float(value) if optim_output.metrics: for key, value in optim_output.metrics.items(): if value is None: continue - metrics[_canonicalize_upstream_metric_key(key)] = float(value) + canonical_key = _canonicalize_upstream_metric_key(key) + if canonical_key: + metrics[canonical_key] = float(value) next_step = state.current_step + 1 checkpoint_name = f"step_{next_step:06d}" diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index f3a69179..cb3e3115 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -849,7 +849,6 @@ async def train_sft( "loss/grad_norm": grad_norm, 
"train/num_trajectories": float(batch.num_trajectories), "train/num_trainable_tokens": float(batch.num_trainable_tokens), - "throughput/train_tok_per_sec": tokens_per_second, } # === Cleanup === diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index 4505215a..8798910b 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -30,7 +30,6 @@ "kl_policy_ref": "loss/kl_policy_ref", "grad_norm": "loss/grad_norm", "learning_rate": "loss/learning_rate", - "tokens_per_second": "throughput/train_tok_per_sec", "num_groups_submitted": "train/num_groups_submitted", "num_groups_trainable": "train/num_groups_trainable", "num_trajectories": "train/num_trajectories", @@ -43,6 +42,8 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: if "/" in metric: return metric + if metric == "tokens_per_second": + return "" if metric.startswith("group_metric_"): return f"reward/group_{metric[len('group_metric_'):]}" return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) @@ -50,8 +51,9 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]: return { - _canonicalize_upstream_metric_key(key): float(value) + canonical_key: float(value) for key, value in metrics.items() + if (canonical_key := _canonicalize_upstream_metric_key(key)) } diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index c4b2bf9f..1f65880d 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -636,8 +636,8 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): split="train", step=1, metrics={ - "costs_prefill": 0.2, - "costs_sample": 0.3, + "costs/train/prefill": 0.2, + "costs/train/sample": 0.3, }, ) await model.log( @@ -645,7 +645,7 @@ async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): split="train", step=2, metrics={ - "costs_prefill": 0.1, + "costs/train/prefill": 0.1, 
}, ) @@ -679,7 +679,7 @@ async def test_cost_cumulative_persists_across_model_recreation( trajectories=None, split="train", step=1, - metrics={"costs_prefill": 0.25}, + metrics={"costs/train/prefill": 0.25}, ) model_2 = Model( @@ -692,7 +692,7 @@ async def test_cost_cumulative_persists_across_model_recreation( trajectories=None, split="train", step=2, - metrics={"costs_prefill": 0.75}, + metrics={"costs/train/prefill": 0.75}, ) history_path = tmp_path / "test/models/test/history.jsonl" diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index c5f951cc..675b1028 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -215,10 +215,11 @@ async def _judge() -> _OpenAIResponse: expected = compute_sample_costs( prompt_tokens=1_000, completion_tokens=2_000, + cost_context="train", pricing=pricing, ) assert metrics["costs/train/llm_judge/global_pricing"] == pytest.approx( - expected["costs_prefill"] + expected["costs_sample"] + expected["costs/train/prefill"] + expected["costs/train/sample"] ) @pytest.mark.asyncio From 3b943dc5862bf1cca0fef2602fdee9237e586360 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 13:54:09 -0700 Subject: [PATCH 43/46] refactor: Require explicit API cost provider and model --- docs/metrics-taxonomy.md | 6 +- src/art/api_costs.py | 208 +++++++---------------------- src/art/metrics.py | 10 +- tests/unit/test_metrics_builder.py | 2 + tests/unit/test_track_api_cost.py | 46 +++++-- 5 files changed, 97 insertions(+), 175 deletions(-) diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md index 72603c68..3bd28a11 100644 --- a/docs/metrics-taxonomy.md +++ b/docs/metrics-taxonomy.md @@ -130,8 +130,10 @@ Built-in usage extraction: - Anthropic usage (`input_tokens`, `output_tokens`) Pricing is model-aware by default. 
ART will use the configured model pricing from -`art.costs.MODEL_PRICING` when it can resolve a concrete model name, and it -raises instead of guessing when pricing is missing. +`art.costs.MODEL_PRICING` and `art.api_costs.MODEL_TOKEN_PRICING` for an exact +`model_name` match, and it raises instead of guessing when pricing is missing. +`provider` and `model_name` are required on `@track_api_cost`; ART no longer +infers them from the response payload. You can still override pricing per decorator call or register model-specific pricing on the builder: diff --git a/src/art/api_costs.py b/src/art/api_costs.py index 957ab38b..37b82b7b 100644 --- a/src/art/api_costs.py +++ b/src/art/api_costs.py @@ -4,7 +4,6 @@ from dataclasses import dataclass from functools import wraps from inspect import iscoroutinefunction -import re from typing import Any, ParamSpec, TypeVar from .costs import get_model_pricing, tokens_to_cost @@ -16,7 +15,6 @@ R = TypeVar("R") CostExtractor = Callable[[Any], float | None] -ModelNameGetter = Callable[[Any], str | None] ResponseGetter = Callable[[Any], Any] @@ -44,7 +42,7 @@ class _AnthropicTokenUsage: cache_read_input_tokens: float -_DEFAULT_TOKEN_PRICING: dict[str, TokenPricing] = { +MODEL_TOKEN_PRICING: dict[str, TokenPricing] = { "openai/gpt-4.1": TokenPricing( prompt_per_million=2.0, completion_per_million=8.0, @@ -59,8 +57,8 @@ class _AnthropicTokenUsage: } -def _default_token_pricing(model_name: str) -> TokenPricing | None: - explicit = _DEFAULT_TOKEN_PRICING.get(model_name) +def _configured_token_pricing(model_name: str) -> TokenPricing | None: + explicit = MODEL_TOKEN_PRICING.get(model_name) if explicit is not None: return explicit @@ -72,6 +70,7 @@ def _default_token_pricing(model_name: str) -> TokenPricing | None: completion_per_million=pricing.sample, ) + def normalize_provider(provider: str | None) -> str | None: if provider is None: return None @@ -113,17 +112,6 @@ def _response_usage(response: Any) -> Any: return getattr(response, 
"usage", None) -def _response_model_name(response: Any) -> str | None: - if isinstance(response, dict): - value = response.get("model") - else: - value = getattr(response, "model", None) - if value is None: - return None - normalized = str(value).strip() - return normalized or None - - def _extract_openai_token_counts(response: Any) -> _OpenAITokenUsage | None: usage = _response_usage(response) prompt_tokens = _read_usage_field(usage, "prompt_tokens") @@ -170,24 +158,6 @@ def _extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | Non ) -def _detect_provider(response: Any) -> str | None: - usage = _response_usage(response) - if usage is None: - return None - - if ( - _read_usage_field(usage, "prompt_tokens") is not None - or _read_usage_field(usage, "completion_tokens") is not None - ): - return OPENAI_PROVIDER - if ( - _read_usage_field(usage, "input_tokens") is not None - or _read_usage_field(usage, "output_tokens") is not None - ): - return ANTHROPIC_PROVIDER - return None - - def _estimate_openai_cost( token_counts: _OpenAITokenUsage | None, pricing: TokenPricing, @@ -250,7 +220,7 @@ def _estimate_anthropic_cost( def _estimate_provider_cost( - provider_name: str | None, + provider_name: str, response: Any, pricing: TokenPricing, ) -> float | None: @@ -260,42 +230,10 @@ def _estimate_provider_cost( return _estimate_anthropic_cost( _extract_anthropic_token_counts(response), pricing, - ) + ) return None -def _strip_snapshot_suffix(model_name: str) -> str: - for pattern in ( - r"^(.*)-\d{4}-\d{2}-\d{2}$", - r"^(.*)-\d{8}$", - ): - match = re.match(pattern, model_name) - if match is not None: - return match.group(1) - return model_name - - -def _candidate_model_names( - normalized_model_name: str, - *, - provider: str | None, -) -> list[str]: - candidates: list[str] = [] - - def _append(candidate: str | None) -> None: - if candidate and candidate not in candidates: - candidates.append(candidate) - - _append(normalized_model_name) - 
_append(_strip_snapshot_suffix(normalized_model_name)) - - if provider is not None and "/" not in normalized_model_name: - _append(f"{provider}/{normalized_model_name}") - _append(f"{provider}/{_strip_snapshot_suffix(normalized_model_name)}") - - return candidates - - def _resolve_registered_or_default_pricing( model_name: str, *, @@ -304,7 +242,7 @@ def _resolve_registered_or_default_pricing( registered = model_pricing.get(model_name) if registered is not None: return registered - return _default_token_pricing(model_name) + return _configured_token_pricing(model_name) def _merge_token_pricing( @@ -345,52 +283,19 @@ def _merge_token_pricing( ) -def _resolve_model_name( - response: Any, - *, - provider: str | None, - model_name: str | None, - model_name_getter: ModelNameGetter | None, - model_pricing: Mapping[str, TokenPricing], -) -> str | None: - explicit_model_name = model_name.strip() if model_name is not None else None - if explicit_model_name: - candidate = explicit_model_name - elif model_name_getter is not None: - candidate = model_name_getter(response) - else: - candidate = _response_model_name(response) - - if candidate is None: +def normalize_model_name(model_name: str | None) -> str | None: + if model_name is None: return None - - normalized_model_name = str(candidate).strip() - if not normalized_model_name: + normalized = model_name.strip() + if not normalized: return None - - normalized_provider = normalize_provider(provider) - candidates = _candidate_model_names( - normalized_model_name, - provider=normalized_provider, - ) - for candidate in candidates: - if _resolve_registered_or_default_pricing( - candidate, - model_pricing=model_pricing, - ) is not None: - return candidate - - if normalized_provider is not None and "/" not in normalized_model_name: - return f"{normalized_provider}/{normalized_model_name}" - return normalized_model_name + return normalized def _resolve_token_pricing( - response: Any, *, - provider: str | None, - model_name: str | 
None, - model_name_getter: ModelNameGetter | None, + provider: str, + model_name: str, prompt_price_per_million: float | None, completion_price_per_million: float | None, cached_prompt_price_per_million: float | None, @@ -424,36 +329,22 @@ def _resolve_token_pricing( else None ) - resolved_model_name = _resolve_model_name( - response, - provider=provider, - model_name=model_name, - model_name_getter=model_name_getter, - model_pricing=model_pricing, - ) - if resolved_model_name is None: - if explicit_prompt_price is not None and explicit_completion_price is not None: - return TokenPricing( - prompt_per_million=explicit_prompt_price, - completion_per_million=explicit_completion_price, - cached_prompt_per_million=explicit_cached_prompt_price, - cache_creation_per_million=explicit_cache_creation_price, - cache_read_per_million=explicit_cache_read_price, - ) - raise ValueError( - "API cost tracking requires model-aware pricing. " - "Provide both explicit token prices or supply a model_name " - "(or response.model / model_name_getter) with configured pricing." - ) + if normalize_provider(provider) is None: + raise ValueError("provider must be non-empty") + + normalized_model_name = normalize_model_name(model_name) + if normalized_model_name is None: + raise ValueError("model_name must be non-empty") configured_pricing = _resolve_registered_or_default_pricing( - resolved_model_name, + normalized_model_name, model_pricing=model_pricing, ) if configured_pricing is None: raise ValueError( - f"No pricing configured for model '{resolved_model_name}'. " - "Provide explicit token prices or register model pricing." + f"No pricing configured for model '{normalized_model_name}'. " + "Add it to art.api_costs.MODEL_TOKEN_PRICING, art.costs.MODEL_PRICING, " + "or register it with MetricsBuilder.register_model_pricing()." 
) return _merge_token_pricing( @@ -469,9 +360,8 @@ def _resolve_token_pricing( def extract_api_cost( response: Any, *, - provider: str | None, - model_name: str | None, - model_name_getter: ModelNameGetter | None, + provider: str, + model_name: str, prompt_price_per_million: float | None, completion_price_per_million: float | None, cached_prompt_price_per_million: float | None, @@ -480,20 +370,19 @@ def extract_api_cost( cost_extractors: Mapping[str, CostExtractor], model_pricing: Mapping[str, TokenPricing], ) -> float | None: - provider_name = normalize_provider(provider) or _detect_provider(response) - custom_extractor = ( - cost_extractors.get(provider_name) if provider_name is not None else None - ) + provider_name = normalize_provider(provider) + if provider_name is None: + raise ValueError("provider must be non-empty") + + custom_extractor = cost_extractors.get(provider_name) if custom_extractor is not None: custom_cost = custom_extractor(response) if custom_cost is not None: return float(custom_cost) pricing = _resolve_token_pricing( - response, provider=provider_name, model_name=model_name, - model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, cached_prompt_price_per_million=cached_prompt_price_per_million, @@ -505,21 +394,23 @@ def extract_api_cost( if provider_cost is not None: return provider_cost - openai_token_counts = _extract_openai_token_counts(response) - if openai_token_counts is not None: - return _estimate_openai_cost(openai_token_counts, pricing) - anthropic_token_counts = _extract_anthropic_token_counts(response) - return _estimate_anthropic_cost(anthropic_token_counts, pricing) + if provider_name in {OPENAI_PROVIDER, ANTHROPIC_PROVIDER}: + raise ValueError( + f"Response usage does not match provider '{provider_name}'. " + "Pass the correct provider/model pair or register a custom cost extractor." 
+ ) + raise ValueError( + f"No cost extractor registered for provider '{provider_name}'." + ) def _record_api_cost( *, result: Any, source: str, - provider: str | None, + provider: str, response_getter: ResponseGetter | None, - model_name: str | None, - model_name_getter: ModelNameGetter | None, + model_name: str, prompt_price_per_million: float | None, completion_price_per_million: float | None, cached_prompt_price_per_million: float | None, @@ -539,7 +430,6 @@ def _record_api_cost( response, provider=provider, model_name=model_name, - model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, cached_prompt_price_per_million=cached_prompt_price_per_million, @@ -551,9 +441,8 @@ def _record_api_cost( def track_api_cost( *, source: str, - provider: str | None = None, - model_name: str | None = None, - model_name_getter: ModelNameGetter | None = None, + provider: str, + model_name: str, response_getter: ResponseGetter | None = None, prompt_price_per_million: float | None = None, completion_price_per_million: float | None = None, @@ -566,6 +455,11 @@ def track_api_cost( raise ValueError("source must be non-empty") normalized_provider = normalize_provider(provider) + if normalized_provider is None: + raise ValueError("provider must be non-empty") + normalized_model_name = normalize_model_name(model_name) + if normalized_model_name is None: + raise ValueError("model_name must be non-empty") def _decorate(func: Callable[P, R]) -> Callable[P, R]: if iscoroutinefunction(func): @@ -578,8 +472,7 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): source=normalized_source, provider=normalized_provider, response_getter=response_getter, - model_name=model_name, - model_name_getter=model_name_getter, + model_name=normalized_model_name, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, 
cached_prompt_price_per_million=cached_prompt_price_per_million, @@ -598,8 +491,7 @@ def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): source=normalized_source, provider=normalized_provider, response_getter=response_getter, - model_name=model_name, - model_name_getter=model_name_getter, + model_name=normalized_model_name, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, cached_prompt_price_per_million=cached_prompt_price_per_million, diff --git a/src/art/metrics.py b/src/art/metrics.py index 7be67e08..11d1e4d6 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -9,9 +9,9 @@ from .api_costs import ( CostExtractor, - ModelNameGetter, TokenPricing, extract_api_cost, + normalize_model_name, normalize_provider, ) @@ -94,9 +94,8 @@ def add_response_cost( source: str, response: Any, *, - provider: str | None = None, - model_name: str | None = None, - model_name_getter: "ModelNameGetter | None" = None, + provider: str, + model_name: str, prompt_price_per_million: float | None = None, completion_price_per_million: float | None = None, cached_prompt_price_per_million: float | None = None, @@ -111,7 +110,6 @@ def add_response_cost( response, provider=provider, model_name=model_name, - model_name_getter=model_name_getter, prompt_price_per_million=prompt_price_per_million, completion_price_per_million=completion_price_per_million, cached_prompt_price_per_million=cached_prompt_price_per_million, @@ -256,7 +254,7 @@ def register_model_pricing( cache_creation_per_million: float | None = None, cache_read_per_million: float | None = None, ) -> None: - normalized_model_name = model_name.strip() + normalized_model_name = normalize_model_name(model_name) if not normalized_model_name: raise ValueError("model_name must be non-empty") self._shared_state.model_pricing[normalized_model_name] = TokenPricing( diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 57def527..5031df14 
100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -197,6 +197,8 @@ async def test_add_response_cost_uses_registered_model_pricing(self) -> None: "model": "anthropic/test-judge", "usage": {"input_tokens": 40, "output_tokens": 60}, }, + provider="anthropic", + model_name="anthropic/test-judge", ) metrics = await builder.flush() diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index 675b1028..c16e5fd7 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -87,6 +87,7 @@ async def test_openai_cost_extraction_with_explicit_pricing(self) -> None: @track_api_cost( source="llm_judge/correctness", provider="openai", + model_name="openai/gpt-4.1", prompt_price_per_million=1.0, completion_price_per_million=2.0, ) @@ -109,6 +110,7 @@ async def test_openai_cost_extraction_accounts_for_cached_tokens(self) -> None: @track_api_cost( source="llm_judge/cached_openai", provider="openai", + model_name="openai/gpt-4.1", prompt_price_per_million=2.0, completion_price_per_million=8.0, cached_prompt_price_per_million=0.5, @@ -140,6 +142,7 @@ async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> @track_api_cost( source="llm_judge/faithfulness", + provider="anthropic", model_name="anthropic/test-judge", ) async def _judge() -> _AnthropicResponse: @@ -190,14 +193,18 @@ async def _judge() -> _AnthropicResponse: assert metrics["costs/eval/llm_judge/anthropic_cache"] == pytest.approx(0.00495) @pytest.mark.asyncio - async def test_response_model_name_resolves_provider_scoped_global_pricing( + async def test_explicit_model_name_uses_global_pricing( self, ) -> None: builder = MetricsBuilder(cost_context="train") pricing = get_model_pricing("openai/gpt-oss-20b") assert pricing is not None - @track_api_cost(source="llm_judge/global_pricing", provider="openai") + @track_api_cost( + source="llm_judge/global_pricing", + provider="openai", + 
model_name="openai/gpt-oss-20b", + ) async def _judge() -> _OpenAIResponse: return _OpenAIResponse( prompt_tokens=1_000, @@ -223,7 +230,7 @@ async def _judge() -> _OpenAIResponse: ) @pytest.mark.asyncio - async def test_response_model_name_resolves_provider_scoped_registered_pricing( + async def test_explicit_model_name_uses_registered_pricing( self, ) -> None: builder = MetricsBuilder(cost_context="eval") @@ -233,7 +240,11 @@ async def test_response_model_name_resolves_provider_scoped_registered_pricing( completion_per_million=2.5, ) - @track_api_cost(source="llm_judge/provider_resolution", provider="anthropic") + @track_api_cost( + source="llm_judge/provider_resolution", + provider="anthropic", + model_name="anthropic/test-judge", + ) async def _judge() -> _AnthropicResponse: return _AnthropicResponse( input_tokens=400, @@ -253,10 +264,14 @@ async def _judge() -> _AnthropicResponse: ) @pytest.mark.asyncio - async def test_snapshot_model_name_resolves_to_global_pricing(self) -> None: + async def test_explicit_model_name_does_not_depend_on_response_model(self) -> None: builder = MetricsBuilder(cost_context="train") - @track_api_cost(source="llm_judge/snapshot", provider="openai") + @track_api_cost( + source="llm_judge/snapshot", + provider="openai", + model_name="openai/gpt-4.1", + ) async def _judge() -> _OpenAIResponse: return _OpenAIResponse( prompt_tokens=1_000, @@ -279,13 +294,17 @@ async def _judge() -> _OpenAIResponse: async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None: builder = MetricsBuilder(cost_context="train") - @track_api_cost(source="llm_judge/missing_pricing", provider="openai") + @track_api_cost( + source="llm_judge/missing_pricing", + provider="openai", + model_name="openai/missing-pricing-model", + ) async def _judge() -> _OpenAIResponse: return _OpenAIResponse(prompt_tokens=10, completion_tokens=20) token = builder.activate() try: - with pytest.raises(ValueError, match="model-aware pricing"): + with 
pytest.raises(ValueError, match="No pricing configured"): await _judge() finally: token.var.reset(token) @@ -298,6 +317,7 @@ async def test_custom_extractor_takes_precedence(self) -> None: @track_api_cost( source="llm_judge/custom", provider="openai", + model_name="openai/gpt-4.1", prompt_price_per_million=1.0, completion_price_per_million=2.0, ) @@ -315,7 +335,11 @@ async def _judge() -> _OpenAIResponse: @pytest.mark.asyncio async def test_decorator_noops_without_active_builder(self) -> None: - @track_api_cost(source="llm_judge/no_context", provider="openai") + @track_api_cost( + source="llm_judge/no_context", + provider="openai", + model_name="openai/gpt-4.1", + ) async def _judge() -> _OpenAIResponse: return _OpenAIResponse(prompt_tokens=10, completion_tokens=20) @@ -330,6 +354,7 @@ async def test_for_cost_context_routes_to_eval_and_shares_state(self) -> None: @track_api_cost( source="llm_judge/correctness", provider="openai", + model_name="openai/gpt-4.1", prompt_price_per_million=1.0, completion_price_per_million=2.0, ) @@ -359,6 +384,7 @@ async def test_model_log_emits_train_and_eval_costs(self, tmp_path: Path) -> Non @track_api_cost( source="llm_judge/correctness", provider="openai", + model_name="openai/gpt-4.1", prompt_price_per_million=1.0, completion_price_per_million=2.0, ) @@ -368,6 +394,7 @@ async def _train_judge() -> _OpenAIResponse: @track_api_cost( source="llm_judge/factuality", provider="anthropic", + model_name="anthropic/claude-sonnet-4-6", prompt_price_per_million=3.0, completion_price_per_million=4.0, ) @@ -471,6 +498,7 @@ async def test_pipeline_trainer_activates_eval_context_for_eval_fn( @track_api_cost( source="llm_judge/correctness", provider="openai", + model_name="openai/gpt-4.1", prompt_price_per_million=1.0, completion_price_per_million=2.0, ) From a03299439a452a3f400bf070f2488544d5a5891c Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 15:44:48 -0700 Subject: [PATCH 44/46] docs: Replace metrics taxonomy note --- 
docs/docs.json | 3 +- docs/features/tracking-metrics.mdx | 167 +++++++++++++++++++++++++++ docs/getting-started/quick-start.mdx | 2 +- docs/metrics-taxonomy.md | 149 ------------------------ 4 files changed, 170 insertions(+), 151 deletions(-) create mode 100644 docs/features/tracking-metrics.mdx delete mode 100644 docs/metrics-taxonomy.md diff --git a/docs/docs.json b/docs/docs.json index 99f5675c..2b99e176 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -67,6 +67,7 @@ "features/checkpoint-forking", "features/checkpoint-deletion", "features/additional-histories", + "features/tracking-metrics", "features/mcp-rl" ] }, @@ -106,4 +107,4 @@ "bluesky": "https://bsky.app/profile/openpipe.bsky.social", "github": "https://github.com/openpipe/ART" } -} \ No newline at end of file +} diff --git a/docs/features/tracking-metrics.mdx b/docs/features/tracking-metrics.mdx new file mode 100644 index 00000000..3aea84e9 --- /dev/null +++ b/docs/features/tracking-metrics.mdx @@ -0,0 +1,167 @@ +--- +title: "Tracking Metrics" +description: "See what ART logs automatically and how to add your own metrics and costs." +sidebarTitle: "Tracking Metrics" +icon: "chart-line" +--- + +ART writes a metrics row every time you call `model.log(...)`. Those rows go to +`history.jsonl` in the run directory and, if W&B logging is enabled, to W&B. + +Use this page for three things: + +- understand the metrics ART emits automatically +- add task-specific metrics from your own rollout code +- track external judge and API spend alongside training metrics + +## What ART logs automatically + +When you call `await model.train(...)` or `await model.log(train_groups, split="train")`, +ART already logs most of the metrics you need to monitor a run. 
+ +| Type | Examples | +| --- | --- | +| Reward | `reward/mean`, `reward/std_dev`, `reward/exception_rate` | +| Loss | `loss/train`, `loss/entropy`, `loss/kl_div`, `loss/grad_norm`, `loss/learning_rate` | +| Data | `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`, `data/step_num_groups_trainable` | +| Train summary | `train/num_groups_submitted`, `train/num_groups_trainable`, `train/num_trajectories` | +| Time | `time/wall_clock_sec`, `time/step_wall_s`, `time/step_trainer_s` | +| Cost | `costs/gpu` on `LocalBackend` when GPU pricing is known | + +If ART has the inputs it needs, it also derives: + +- cumulative metrics such as `time/cum/trainer_s`, `data/cum/num_unique_scenarios`, and `costs/cum/all` +- cost rollups such as `costs/train`, `costs/eval`, and `costs/all` +- throughput metrics such as `throughput/avg_trainer_tok_per_s` and `throughput/avg_actor_tok_per_s` + + + Some metrics only appear when the backend or your code provides the underlying + inputs. For example, `throughput/avg_actor_tok_per_s` requires both + `data/step_actor_tokens` and `time/step_actor_s`. + + +## Add task-specific outcome metrics + +Attach metrics directly to each `Trajectory` when your rollout code knows whether +an attempt succeeded, how many tools it called, or any other task-specific +signal. 
+ +```python +async def rollout(model: art.Model, scenario: Scenario) -> art.Trajectory: + trajectory = art.Trajectory( + messages_and_choices=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": scenario.prompt}, + ], + metadata={"scenario_id": scenario.id}, + ) + + completion = await model.openai_client().chat.completions.create( + model=model.get_inference_name(), + messages=trajectory.messages(), + ) + trajectory.messages_and_choices.append(completion.choices[0]) + + trajectory.reward = score_reward(trajectory) + trajectory.metrics["correct"] = float(is_correct(trajectory)) + trajectory.metrics["tool_calls"] = float(count_tool_calls(trajectory)) + return trajectory +``` + +On train steps, ART averages those rollout metrics and logs them under the +`reward/` namespace, such as `reward/correct` and `reward/tool_calls`. + +If you want to record one value per `TrajectoryGroup` instead of one per +trajectory, pass `metrics={...}` when you build the group. ART logs those once +per group, using keys like `reward/group_difficulty` on train steps. + +## Add step-level metrics ART cannot infer + +Use `model.metrics_builder()` for metrics that live outside individual +trajectories, such as actor-side timing, token counts, or idle time. 
+ +```python +builder = model.metrics_builder() + +with builder.measure("time/step_actor_s"): + result = await run_rollouts() + +builder.add_data( + step_num_scenarios=result.num_scenarios, + step_actor_tokens=result.actor_tokens, + scenario_ids=result.scenario_ids, +) +builder.add_idle_times(step_actor_idle_s=result.actor_idle_s) + +await model.log(result.train_groups, split="train", step=result.step) +``` + +A few useful patterns: + +- log `scenario_ids` to unlock `data/cum/num_unique_scenarios` +- log both `data/step_actor_tokens` and `time/step_actor_s` to unlock actor throughput metrics +- log `time/step_eval_s` when eval runs happen outside the backend +- use fully qualified keys like `time/step_actor_s` or `data/step_actor_tokens` for builder-managed metrics + +ART flushes builder-managed metrics on the next `model.log(...)` or +`model.train(...)` call. + +## Track judge and API costs + +Use `@track_api_cost` when a function returns a provider response object with +token usage. Wrap the relevant part of your code in a metrics context so ART +knows whether the spend belongs to training or evaluation. + +```python +from art.metrics import track_api_cost + +@track_api_cost( + source="llm_judge/correctness", + provider="openai", + model_name="openai/gpt-4.1", +) +async def run_judge(client, messages): + return await client.chat.completions.create( + model="gpt-4.1", + messages=messages, + ) + +with model.metrics_builder("train").activate_context(): + await run_judge(judge_client, train_messages) + +with model.metrics_builder("eval").activate_context(): + await run_judge(judge_client, eval_messages) +``` + +The next metrics row will include: + +- `costs/train/llm_judge/correctness` or `costs/eval/llm_judge/correctness` +- rollups such as `costs/train`, `costs/eval`, and `costs/all` +- cumulative totals such as `costs/cum/all` + +ART can price OpenAI and Anthropic responses from their usage fields. 
You must +pass both `provider` and `model_name` to `@track_api_cost`. + +For custom pricing or unsupported models, register pricing on the builder: + +```python +builder = model.metrics_builder() +builder.register_model_pricing( + "anthropic/my-custom-judge", + prompt_per_million=1.2, + completion_per_million=4.8, +) +``` + +## Track GPU cost on LocalBackend + +`LocalBackend` can log `costs/gpu` automatically on train steps. ART currently +auto-detects H200 pricing at `$3/hour` per GPU. For other hardware, pass an +explicit override: + +```python +backend = LocalBackend(gpu_cost_per_hour_usd=2.25) +``` + +This lets ART include GPU spend in the same metrics stream as rewards, losses, +and judge/API costs. diff --git a/docs/getting-started/quick-start.mdx b/docs/getting-started/quick-start.mdx index 58eb0ccf..63a38e02 100644 --- a/docs/getting-started/quick-start.mdx +++ b/docs/getting-started/quick-start.mdx @@ -38,4 +38,4 @@ At the top of the [notebook](https://colab.research.google.com/github/openpipe/a ## Step 3: Track metrics -While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training! +While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. For a guide to the metrics ART logs automatically and how to add your own, see [Tracking Metrics](/features/tracking-metrics). If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training! 
diff --git a/docs/metrics-taxonomy.md b/docs/metrics-taxonomy.md deleted file mode 100644 index 3bd28a11..00000000 --- a/docs/metrics-taxonomy.md +++ /dev/null @@ -1,149 +0,0 @@ -# Metrics Taxonomy (Phase 1-3) - -Phase 1 introduces sectioned metric namespaces and hierarchical cost rollups. - -## Sections - -- `reward/*` -- `loss/*` -- `throughput/*` -- `costs/*` -- `time/*` -- `data/*` -- `train/*`, `val/*`, `test/*` - -## Backend Output - -ART backends emit canonical sectioned keys directly. The canonical training keys include: - -- `reward/mean` -- `reward/std_dev` -- `reward/exception_rate` -- `reward/group_` -- `loss/train` -- `loss/entropy` -- `loss/kl_div` -- `loss/kl_policy_ref` -- `loss/grad_norm` -- `loss/learning_rate` -- `train/num_groups_submitted` -- `train/num_groups_trainable` -- `train/num_trajectories` -- `train/num_trainable_tokens` -- `data/step_trainer_tokens` -- `data/step_num_datums` -- `data/step_num_gradient_steps` - -## Cost Rollups - -Cost leaves can be logged with hierarchical keys, for example: - -- hierarchical keys, e.g. 
`costs/train/llm_judge/correctness` - -ART rolls costs up automatically: - -- parent rollups (for example `costs/train`, `costs/all`) -- cumulative keys under the `cum/` namespace (for example `costs/cum/all`) - -## Metrics Added By ART - -ART now emits the following metrics from library internals where the data is available: - -- `reward/*` aggregates from `model.log(..., split="train")` -- `loss/*` from trainer backends -- `time/wall_clock_sec` and `training_step` on every logged row -- `time/step_trainer_s` for training calls -- `time/step_wall_s` from `PipelineTrainer` and `LocalBackend` train-step logs -- `time/step_actor_s`, `time/step_eval_s` from `PipelineTrainer` -- `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted` -- `data/step_num_groups_trainable` for train splits -- `data/cum/num_unique_scenarios` when `scenario_id` is present in group or trajectory metadata -- `data/step_trainer_tokens` where the backend knows the trainer token count -- `costs/gpu` on `LocalBackend` train-step logs when ART can resolve GPU pricing -- `throughput/cum/trainer_idle_s`, `throughput/cum/actor_idle_s` -- `throughput/avg_trainer_tok_per_s`, `throughput/avg_actor_tok_per_s` when both token and time inputs are available - -Some metrics remain user-owned because ART cannot infer them reliably for every workflow, especially actor token usage outside the pipeline trainer. - -For automatic GPU cost on `LocalBackend`, ART currently auto-detects H200s at -$3/hour per GPU. For other GPU types, pass `gpu_cost_per_hour_usd=...` to -`LocalBackend(...)` if you want ART to emit `costs/gpu` instead of skipping it. 
- -## User Helpers - -Use the builder helpers for step-level metrics that only user code can know: - -```python -builder = model.metrics_builder() - -with builder.measure("time/step_actor_s"): - result = await run_rollouts() - -builder.add_data( - step_actor_tokens=result.actor_tokens, - scenario_ids=result.scenario_ids, -) - -builder.add_idle_times(step_actor_idle_s=result.actor_idle_s) -``` - -If these metrics are logged before the next `model.log(...)` flush, ART will also emit the cumulative and derived throughput metrics automatically. - -## API Cost Decorator (Phase 2/3) - -Use `@track_api_cost` to automatically write judge/API spend into `costs/{train|eval}/...`. - -```python -from art.metrics import track_api_cost - -@track_api_cost( - source="llm_judge/correctness", - provider="openai", - model_name="openai/gpt-oss-20b", -) -async def run_judge(client, messages): - return await client.chat.completions.create( - model="gpt-oss-20b", - messages=messages, - ) -``` - -Activate metric cost context while running train/eval logic: - -```python -with model.metrics_builder("train").activate_context(): - await run_judge(client, train_messages) - -with model.metrics_builder("eval").activate_context(): - await run_judge(client, eval_messages) -``` - -The next `model.log(...)` flush for that step will include: - -- `costs/train/llm_judge/correctness` (or `costs/eval/...`) -- hierarchical rollups like `costs/train`, `costs/all` -- cumulative keys like `costs/cum/all` - -Built-in usage extraction: - -- OpenAI usage (`prompt_tokens`, `completion_tokens`) -- Anthropic usage (`input_tokens`, `output_tokens`) - -Pricing is model-aware by default. ART will use the configured model pricing from -`art.costs.MODEL_PRICING` and `art.api_costs.MODEL_TOKEN_PRICING` for an exact -`model_name` match, and it raises instead of guessing when pricing is missing. -`provider` and `model_name` are required on `@track_api_cost`; ART no longer -infers them from the response payload. 
- -You can still override pricing per decorator call or register model-specific -pricing on the builder: - -```python -builder = model.metrics_builder() -builder.register_model_pricing( - "anthropic/my-custom-judge", - prompt_per_million=1.2, - completion_per_million=4.8, -) -builder.register_cost_extractor("openai", lambda response: 0.001) # optional custom extractor -``` From 5e3c812365118894ef073f3c2d68085e2f74e89f Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 16:03:50 -0700 Subject: [PATCH 45/46] style: Apply ruff format --- dev/yes-no-maybe-metrics.py | 4 +--- src/art/api_costs.py | 10 +++------- src/art/local/backend.py | 4 +--- src/art/metrics.py | 21 +++++---------------- src/art/metrics_taxonomy.py | 13 ++++++++++--- src/art/model.py | 24 ++++++++++++++++-------- src/art/serverless/backend.py | 2 +- src/art/tinker_native/backend.py | 2 +- src/art/unsloth/train.py | 6 ++++-- tests/integration/test_live_api_cost.py | 6 +++--- tests/unit/test_frontend_logging.py | 6 +++--- tests/unit/test_metric_routing.py | 17 +++++++++-------- tests/unit/test_metrics_builder.py | 4 +++- tests/unit/test_track_api_cost.py | 4 +++- 14 files changed, 63 insertions(+), 60 deletions(-) diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py index cbb0c5fd..8bb80518 100644 --- a/dev/yes-no-maybe-metrics.py +++ b/dev/yes-no-maybe-metrics.py @@ -173,9 +173,7 @@ async def main() -> None: base_model = os.environ.get("BASE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507") project = os.environ.get("PROJECT", "yes-no-maybe-metrics") model = art.TrainableModel( - name=os.environ.get( - "MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}" - ), + name=os.environ.get("MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}"), project=project, base_model=base_model, report_metrics=["wandb"], diff --git a/src/art/api_costs.py b/src/art/api_costs.py index 37b82b7b..67fe9dbd 100644 --- a/src/art/api_costs.py +++ b/src/art/api_costs.py @@ -140,9 +140,7 @@ def 
_extract_anthropic_token_counts(response: Any) -> _AnthropicTokenUsage | Non cache_creation_input_tokens = ( _read_usage_field(usage, "cache_creation_input_tokens") or 0.0 ) - cache_read_input_tokens = ( - _read_usage_field(usage, "cache_read_input_tokens") or 0.0 - ) + cache_read_input_tokens = _read_usage_field(usage, "cache_read_input_tokens") or 0.0 if ( input_tokens is None and output_tokens is None @@ -230,7 +228,7 @@ def _estimate_provider_cost( return _estimate_anthropic_cost( _extract_anthropic_token_counts(response), pricing, - ) + ) return None @@ -399,9 +397,7 @@ def extract_api_cost( f"Response usage does not match provider '{provider_name}'. " "Pass the correct provider/model pair or register a custom cost extractor." ) - raise ValueError( - f"No cost extractor registered for provider '{provider_name}'." - ) + raise ValueError(f"No cost extractor registered for provider '{provider_name}'.") def _record_api_cost( diff --git a/src/art/local/backend.py b/src/art/local/backend.py index d2df1112..c8b0570a 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -96,9 +96,7 @@ def __init__( self._in_process = in_process self._path = path or get_default_art_path() self._gpu_cost_per_hour_usd = ( - float(gpu_cost_per_hour_usd) - if gpu_cost_per_hour_usd is not None - else None + float(gpu_cost_per_hour_usd) if gpu_cost_per_hour_usd is not None else None ) os.makedirs(self._path, exist_ok=True) diff --git a/src/art/metrics.py b/src/art/metrics.py index 11d1e4d6..eda9ab9c 100644 --- a/src/art/metrics.py +++ b/src/art/metrics.py @@ -236,9 +236,7 @@ def for_cost_context(self, cost_context: str) -> "MetricsBuilder": _shared_state=self._shared_state, ) - def register_cost_extractor( - self, provider: str, extractor: CostExtractor - ) -> None: + def register_cost_extractor(self, provider: str, extractor: CostExtractor) -> None: normalized_provider = normalize_provider(provider) if normalized_provider is None: raise ValueError("provider must be 
non-empty") @@ -356,16 +354,9 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: self._shared_state.cum_state[cum_key] = next_value result[cum_key] = next_value - if ( - "data/step_trainer_tokens" in result - or "time/step_trainer_s" in result - ): - trainer_tokens = self._shared_state.cum_state.get( - "data/cum/trainer_tokens" - ) - trainer_seconds = self._shared_state.cum_state.get( - "time/cum/trainer_s" - ) + if "data/step_trainer_tokens" in result or "time/step_trainer_s" in result: + trainer_tokens = self._shared_state.cum_state.get("data/cum/trainer_tokens") + trainer_seconds = self._shared_state.cum_state.get("time/cum/trainer_s") if ( trainer_tokens is not None and trainer_seconds is not None @@ -376,9 +367,7 @@ def _update_throughput_metrics(self, result: dict[str, float]) -> None: ) if "data/step_actor_tokens" in result or "time/step_actor_s" in result: - actor_tokens = self._shared_state.cum_state.get( - "data/cum/actor_tokens" - ) + actor_tokens = self._shared_state.cum_state.get("data/cum/actor_tokens") actor_seconds = self._shared_state.cum_state.get("time/cum/actor_s") if ( actor_tokens is not None diff --git a/src/art/metrics_taxonomy.py b/src/art/metrics_taxonomy.py index 6d8adcc4..6965b68d 100644 --- a/src/art/metrics_taxonomy.py +++ b/src/art/metrics_taxonomy.py @@ -9,7 +9,9 @@ _INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY}) -def average_metric_samples(metric_samples: Iterable[dict[str, float]]) -> dict[str, float]: +def average_metric_samples( + metric_samples: Iterable[dict[str, float]], +) -> dict[str, float]: totals: dict[str, float] = {} counts: dict[str, int] = {} invariant_values: dict[str, float] = {} @@ -65,7 +67,9 @@ def summarize_trajectory_groups( return TrajectoryBatchSummary( num_scenarios=len(groups), - num_trajectories=sum(len(group.trajectories) + len(group.exceptions) for group in groups), + num_trajectories=sum( + len(group.trajectories) + len(group.exceptions) for group in groups + ), 
num_groups_submitted=len(groups), num_groups_trainable=sum(1 for group in groups if _group_is_trainable(group)), scenario_ids=scenario_ids, @@ -117,7 +121,10 @@ def _group_is_trainable(group: TrajectoryGroup) -> bool: def _extract_scenario_id(group: TrajectoryGroup) -> str | None: - for metadata in [group.metadata, *(trajectory.metadata for trajectory in group.trajectories)]: + for metadata in [ + group.metadata, + *(trajectory.metadata for trajectory in group.trajectories), + ]: scenario_id = _extract_scenario_id_from_metadata(metadata) if scenario_id is not None: return scenario_id diff --git a/src/art/model.py b/src/art/model.py index 13f8ed1a..a5ea06c0 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -153,9 +153,13 @@ def __init__( object.__setattr__(self, "_wandb_defined_metrics", set()) object.__setattr__(self, "_run_start_time", time.time()) object.__setattr__(self, "_run_start_monotonic", time.monotonic()) - object.__setattr__(self, "_last_local_train_log_monotonic", self._run_start_monotonic) + object.__setattr__( + self, "_last_local_train_log_monotonic", self._run_start_monotonic + ) object.__setattr__(self, "_last_local_train_step", None) - object.__setattr__(self, "_metrics_builder", MetricsBuilder(cost_context="train")) + object.__setattr__( + self, "_metrics_builder", MetricsBuilder(cost_context="train") + ) object.__setattr__(self, "_metrics_builder_state_loaded", False) @overload @@ -548,7 +552,9 @@ def _collect_automatic_backend_metrics( if "time/step_wall_s" not in provided_metric_keys: automatic_metrics["time/step_wall_s"] = step_wall_s - gpu_cost_getter = getattr(self._backend, "automatic_gpu_cost_per_hour_usd", None) + gpu_cost_getter = getattr( + self._backend, "automatic_gpu_cost_per_hour_usd", None + ) if callable(gpu_cost_getter) and "costs/gpu" not in provided_metric_keys: gpu_cost_per_hour_usd = gpu_cost_getter(self) if gpu_cost_per_hour_usd is not None: @@ -597,9 +603,7 @@ def metrics_builder(self, cost_context: str | None = 
None) -> MetricsBuilder: return self._metrics_builder return self._metrics_builder.for_cost_context(cost_context) - def activate_metrics_context( - self, cost_context: str - ) -> Token[MetricsBuilder]: + def activate_metrics_context(self, cost_context: str) -> Token[MetricsBuilder]: return self.metrics_builder(cost_context).activate() def _load_metrics_builder_state(self) -> None: @@ -777,7 +781,9 @@ async def log( for metric, values in group_metrics.items(): if len(values) > 0: group_key = ( - f"reward/group_{metric}" if split == "train" else f"group_metric_{metric}" + f"reward/group_{metric}" + if split == "train" + else f"group_metric_{metric}" ) averages[group_key] = sum(values) / len(values) @@ -1080,4 +1086,6 @@ async def train_sft( avg_metrics["time/step_trainer_s"] = trainer_elapsed # Get the current step after training step = await self.get_step() - await self.log(trajectories=None, split="train", metrics=avg_metrics, step=step) + await self.log( + trajectories=None, split="train", metrics=avg_metrics, step=step + ) diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index 4bb4d5f3..fcb9f68f 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -63,7 +63,7 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: if metric == "tokens_per_second": return "" if metric.startswith("group_metric_"): - return f"reward/group_{metric[len('group_metric_'):]}" + return f"reward/group_{metric[len('group_metric_') :]}" return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index 9d234944..500a850f 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -77,7 +77,7 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: if metric == "tokens_per_second": return "" if metric.startswith("group_metric_"): - return f"reward/group_{metric[len('group_metric_'):]}" + return 
f"reward/group_{metric[len('group_metric_') :]}" return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index 2a58879b..d93569d4 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -45,7 +45,7 @@ def _canonicalize_upstream_metric_key(metric: str) -> str: if metric == "tokens_per_second": return "" if metric.startswith("group_metric_"): - return f"reward/group_{metric[len('group_metric_'):]}" + return f"reward/group_{metric[len('group_metric_') :]}" return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) @@ -237,7 +237,9 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None: } results_queue.put_nowait({**normalized_metrics, **normalized_logs}) else: - results_queue.put_nowait({**_canonicalize_upstream_metrics(logs), **metrics}) + results_queue.put_nowait( + {**_canonicalize_upstream_metrics(logs), **metrics} + ) trainer._metrics["train"].clear() return log diff --git a/tests/integration/test_live_api_cost.py b/tests/integration/test_live_api_cost.py index c2bd733c..ad7438be 100644 --- a/tests/integration/test_live_api_cost.py +++ b/tests/integration/test_live_api_cost.py @@ -216,9 +216,9 @@ def _judge() -> dict: ) first_row, second_row = _history_rows(history_path) - assert first_row["costs/eval/llm_judge/anthropic_prompt_cache"] == pytest.approx( - first_expected_cost - ) + assert first_row[ + "costs/eval/llm_judge/anthropic_prompt_cache" + ] == pytest.approx(first_expected_cost) assert second_row[ "costs/eval/llm_judge/anthropic_prompt_cache" ] == pytest.approx(second_expected_cost) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 1f65880d..2afb8af6 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -349,9 +349,9 @@ async def test_metric_prefixes(self, tmp_path: Path): "time/wall_clock_sec", ] ] - assert all( - k.startswith(("val/", "data/")) for k in metric_keys - ), f"Not 
all metrics routed into taxonomy namespaces: {metric_keys}" + assert all(k.startswith(("val/", "data/")) for k in metric_keys), ( + f"Not all metrics routed into taxonomy namespaces: {metric_keys}" + ) assert entry["training_step"] == 0 assert entry["time/wall_clock_sec"] >= 0 diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py index f9904527..8f6cad92 100644 --- a/tests/unit/test_metric_routing.py +++ b/tests/unit/test_metric_routing.py @@ -58,8 +58,7 @@ def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None assert run is fake_run define_calls = [ - (call.args, call.kwargs) - for call in fake_wandb.define_metric.call_args_list + (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list ] assert define_calls == [ (("training_step",), {}), @@ -104,14 +103,16 @@ def test_log_metrics_defines_nested_cost_keys_with_training_step( ) define_calls = [ - (call.args, call.kwargs) - for call in fake_wandb.define_metric.call_args_list + (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list ] - assert (("costs/train/sample",), {"step_metric": "training_step"}) in define_calls assert ( - (("costs/cum/train/prefill",), {"step_metric": "training_step"}) - in define_calls - ) + ("costs/train/sample",), + {"step_metric": "training_step"}, + ) in define_calls + assert ( + ("costs/cum/train/prefill",), + {"step_metric": "training_step"}, + ) in define_calls fake_run.log.assert_called_once() logged_metrics = fake_run.log.call_args.args[0] assert logged_metrics["costs/train/sample"] == 0.1 diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py index 5031df14..dfa24a11 100644 --- a/tests/unit/test_metrics_builder.py +++ b/tests/unit/test_metrics_builder.py @@ -79,7 +79,9 @@ async def test_helper_metrics_accumulate_within_a_single_step(self) -> None: assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0) @pytest.mark.asyncio - async def 
test_throughput_metrics_derive_from_time_and_token_cumulatives(self) -> None: + async def test_throughput_metrics_derive_from_time_and_token_cumulatives( + self, + ) -> None: builder = MetricsBuilder(cost_context="train") builder.add_metric("time/step_trainer_s", 4.0) diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py index c16e5fd7..80553e48 100644 --- a/tests/unit/test_track_api_cost.py +++ b/tests/unit/test_track_api_cost.py @@ -132,7 +132,9 @@ async def _judge() -> _OpenAIResponse: assert metrics["costs/train/llm_judge/cached_openai"] == pytest.approx(0.00255) @pytest.mark.asyncio - async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None: + async def test_anthropic_cost_extraction_uses_registered_model_pricing( + self, + ) -> None: builder = MetricsBuilder(cost_context="train") builder.register_model_pricing( "anthropic/test-judge", From b934c25270474ebac9052021cae6faa67aff6152 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Tue, 10 Mar 2026 16:15:17 -0700 Subject: [PATCH 46/46] fix: Resolve ty failures in API cost and Unsloth --- src/art/api_costs.py | 13 +++++++------ src/art/unsloth/service.py | 19 +++++++++++++++---- src/art/unsloth/train.py | 4 ++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/art/api_costs.py b/src/art/api_costs.py index 67fe9dbd..1bbd9ed9 100644 --- a/src/art/api_costs.py +++ b/src/art/api_costs.py @@ -1,10 +1,10 @@ from __future__ import annotations -from collections.abc import Callable, Mapping +from collections.abc import Awaitable, Callable, Mapping from dataclasses import dataclass from functools import wraps from inspect import iscoroutinefunction -from typing import Any, ParamSpec, TypeVar +from typing import Any, ParamSpec, TypeVar, cast from .costs import get_model_pricing, tokens_to_cost @@ -459,10 +459,11 @@ def track_api_cost( def _decorate(func: Callable[P, R]) -> Callable[P, R]: if iscoroutinefunction(func): + async_func = cast(Callable[P, 
Awaitable[Any]], func) @wraps(func) - async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): - result = await func(*args, **kwargs) + async def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any: + result = await async_func(*args, **kwargs) _record_api_cost( result=result, source=normalized_source, @@ -477,10 +478,10 @@ async def _async_wrapper(*args: P.args, **kwargs: P.kwargs): ) return result - return _async_wrapper + return cast(Callable[P, R], _async_wrapper) @wraps(func) - def _sync_wrapper(*args: P.args, **kwargs: P.kwargs): + def _sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R: result = func(*args, **kwargs) _record_api_cost( result=result, diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index 9d16bf7d..76ab1991 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -13,6 +13,7 @@ from datasets import Dataset import peft import torch +from torch.optim import Optimizer from transformers import GenerationMixin, PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase from trl import GRPOConfig, GRPOTrainer @@ -190,6 +191,13 @@ def save_checkpoint( return checkpoint_dir +def _get_trainer_optimizer(trainer: GRPOTrainer) -> Optimizer: + optimizer = cast(Optimizer | None, getattr(trainer, "optimizer", None)) + if optimizer is None: + raise RuntimeError("Trainer optimizer must be initialized before training") + return optimizer + + # ============================================================================ # Model Classes # ============================================================================ @@ -541,10 +549,11 @@ def _reset_optimizer_if_mode_changed( mode_changed = ( self._last_training_mode is not None and self._last_training_mode != mode ) + optimizer = _get_trainer_optimizer(self._state.trainer) if mode_changed: # Clear all optimizer state (exp_avg, exp_avg_sq, step for each param) - self._state.trainer.optimizer.state.clear() + optimizer.state.clear() 
self._last_training_mode = mode @@ -576,9 +585,10 @@ async def _train_dedicated( ) -> AsyncIterator[dict[str, float]]: """Train in dedicated mode — no sleep/wake, vLLM keeps running on separate GPU.""" self._reset_optimizer_if_mode_changed("rl") + optimizer = _get_trainer_optimizer(self._state.trainer) rl_weight_decay = 0.1 - for param_group in self._state.trainer.optimizer.param_groups: + for param_group in optimizer.param_groups: param_group["weight_decay"] = rl_weight_decay packed_tensors = packed_tensors_from_dir(**disk_packed_tensors) @@ -661,10 +671,11 @@ async def _train_shared( # Reset optimizer state if switching from SFT to RL self._reset_optimizer_if_mode_changed("rl") + optimizer = _get_trainer_optimizer(self._state.trainer) # Set RL-specific hyperparameters rl_weight_decay = 0.1 - for param_group in self._state.trainer.optimizer.param_groups: + for param_group in optimizer.param_groups: param_group["weight_decay"] = rl_weight_decay # Load packed tensors @@ -794,7 +805,7 @@ async def train_sft( # Get model and optimizer peft_model = self._state.peft_model self._reset_optimizer_if_mode_changed("sft") - optimizer = self._state.trainer.optimizer + optimizer = _get_trainer_optimizer(self._state.trainer) # Set SFT-specific hyperparameters sft_weight_decay = 0.01 diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index d93569d4..399c1c72 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -3,7 +3,7 @@ from contextlib import contextmanager, nullcontext import gc import os -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, cast import nest_asyncio from peft.peft_model import PeftModel @@ -220,7 +220,7 @@ def compute_loss( def get_log_fn( - trainer: "GRPOTrainer", results_queue: asyncio.Queue[dict[str, float]] + trainer: Any, results_queue: asyncio.Queue[dict[str, float]] ) -> Callable[..., None]: def log(logs: dict[str, float], start_time: float | None = None) -> None: metrics = {