diff --git a/dev/yes-no-maybe-metrics.py b/dev/yes-no-maybe-metrics.py new file mode 100644 index 000000000..8bb80518d --- /dev/null +++ b/dev/yes-no-maybe-metrics.py @@ -0,0 +1,259 @@ +"""Yes-no-maybe metrics demo for the LocalBackend `model.train()` path. + +This keeps the same prompt family, rollout structure, and reward ordering as +`dev/yes-no-maybe.py` while adding explicit metrics taxonomy instrumentation for +actor/eval timing and data metrics, while relying on LocalBackend for automatic +step wall time and GPU cost logging. +""" + +from __future__ import annotations + +import asyncio +from itertools import permutations +import os +import time + +from dotenv import load_dotenv +import openai + +try: + import unsloth # noqa: F401 +except ImportError: + pass + +import art +from art.local import LocalBackend + + +async def create_chat_completion( + client: openai.AsyncOpenAI, + *, + model_name: str, + messages: art.Messages, + max_tokens: int, + timeout: float, +) -> openai.types.chat.chat_completion.ChatCompletion: + return await client.chat.completions.create( + messages=messages, + model=model_name, + max_tokens=max_tokens, + timeout=timeout, + ) + + +def with_quotes(word: str) -> str: + return f"'{word}'" + + +def build_prompts() -> list[str]: + return [ + f"{prefix} with {', '.join([with_quotes(word) if use_quotes else word for word in words]) if len(words) == 3 else f'{words[0]}' + (f' or {words[1]}' if len(words) > 1 else '')}" + for prefix in ["respond", "just respond"] + for use_quotes in [True, False] + for words in ( + list(permutation) + for length in [3, 2] + for permutation in permutations(["yes", "no", "maybe"], length) + ) + ] + + +def reward_for_answer(content: str | None) -> float: + if content == "yes": + return 0.5 + if content == "no": + return 0.75 + if content == "maybe": + return 1.0 + return 0.0 + + +def scenario_id_for_prompt(prompt: str) -> str: + return prompt.replace(" ", "_").replace("'", "") + + +def response_total_tokens( + 
response: openai.types.chat.chat_completion.ChatCompletion, +) -> int: + usage = response.usage + if usage is None: + return 0 + prompt_tokens = int(usage.prompt_tokens or 0) + completion_tokens = int(usage.completion_tokens or 0) + return prompt_tokens + completion_tokens + + +def total_actor_tokens(groups: list[art.TrajectoryGroup]) -> int: + return sum( + int(trajectory.metadata.get("actor_total_tokens", 0) or 0) + for group in groups + for trajectory in group.trajectories + ) + + +async def rollout( + client: openai.AsyncOpenAI, + model: art.TrainableModel, + prompt: str, + *, + max_tokens: int, + timeout: float, +) -> art.Trajectory: + messages: art.Messages = [{"role": "user", "content": prompt}] + chat_completion = await create_chat_completion( + client, + model_name=model.get_inference_name(), + messages=messages, + max_tokens=max_tokens, + timeout=timeout, + ) + choice = chat_completion.choices[0] + content = choice.message.content + return art.Trajectory( + messages_and_choices=[*messages, choice], + reward=reward_for_answer(content), + metadata={ + "scenario_id": scenario_id_for_prompt(prompt), + "actor_total_tokens": response_total_tokens(chat_completion), + }, + metrics={ + "valid_answer": reward_for_answer(content) > 0.0, + }, + ) + + +async def evaluate( + client: openai.AsyncOpenAI, + model: art.TrainableModel, + prompts: list[str], + *, + max_tokens: int, + timeout: float, +) -> list[art.TrajectoryGroup]: + groups = await art.gather_trajectory_groups( + art.TrajectoryGroup( + [ + rollout( + client, + model, + prompt, + max_tokens=max_tokens, + timeout=timeout, + ) + ], + metadata={"scenario_id": scenario_id_for_prompt(prompt)}, + ) + for prompt in prompts + ) + return groups + + +def print_history_summary(model: art.TrainableModel) -> None: + history_path = ( + model.base_path + f"/{model.project}/models/{model.name}/history.jsonl" + ) + print(f"History: {history_path}") + + +def build_internal_config() -> art.dev.InternalModelConfig: + return 
art.dev.InternalModelConfig( + engine_args=art.dev.EngineArgs( + gpu_memory_utilization=float( + os.environ.get("GPU_MEMORY_UTILIZATION", "0.85") + ), + max_model_len=int(os.environ.get("MAX_MODEL_LEN", "4096")), + ) + ) + + +async def main() -> None: + load_dotenv() + + backend = LocalBackend() + base_model = os.environ.get("BASE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507") + project = os.environ.get("PROJECT", "yes-no-maybe-metrics") + model = art.TrainableModel( + name=os.environ.get("MODEL_NAME", f"yes-no-maybe-metrics-{int(time.time())}"), + project=project, + base_model=base_model, + report_metrics=["wandb"], + _internal_config=build_internal_config(), + ) + try: + await model.register(backend) + + prompts = build_prompts() + eval_prompts = prompts[: int(os.environ.get("EVAL_PROMPTS", "12"))] + openai_client = model.openai_client() + max_steps = int(os.environ.get("NUM_STEPS", "20")) + rollouts_per_prompt = int(os.environ.get("ROLLOUTS_PER_PROMPT", "32")) + max_tokens = int(os.environ.get("MAX_TOKENS", "100")) + timeout = float(os.environ.get("TIMEOUT", "100")) + eval_every_n_steps = int(os.environ.get("EVAL_EVERY_N_STEPS", "1")) + learning_rate = float(os.environ.get("LEARNING_RATE", "1e-4")) + + start_step = await model.get_step() + for offset in range(max_steps): + current_step = start_step + offset + + if ( + eval_every_n_steps > 0 + and (current_step - start_step) % eval_every_n_steps == 0 + ): + eval_builder = model.metrics_builder("eval") + with eval_builder.activate_context(): + with eval_builder.measure("time/step_eval_s"): + val_groups = await evaluate( + openai_client, + model, + eval_prompts, + max_tokens=max_tokens, + timeout=timeout, + ) + eval_builder.add_data( + step_actor_tokens=total_actor_tokens(val_groups) + ) + await model.log(val_groups, split="val", step=current_step) + + train_builder = model.metrics_builder("train") + with train_builder.activate_context(): + with train_builder.measure("time/step_actor_s"): + train_groups = await 
art.gather_trajectory_groups( + ( + art.TrajectoryGroup( + rollout( + openai_client, + model, + prompt, + max_tokens=max_tokens, + timeout=timeout, + ) + for _ in range(rollouts_per_prompt) + ) + for prompt in prompts + ) + ) + train_builder.add_data( + step_actor_tokens=total_actor_tokens(train_groups) + ) + result = await backend.train( + model, + train_groups, + learning_rate=learning_rate, + ) + + await model.log( + split="train", + step=result.step, + trajectories=train_groups, + metrics=result.metrics, + ) + print(f"step {result.step} complete") + + print_history_summary(model) + finally: + await backend.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/docs.json b/docs/docs.json index 99f5675c7..2b99e176e 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -67,6 +67,7 @@ "features/checkpoint-forking", "features/checkpoint-deletion", "features/additional-histories", + "features/tracking-metrics", "features/mcp-rl" ] }, @@ -106,4 +107,4 @@ "bluesky": "https://bsky.app/profile/openpipe.bsky.social", "github": "https://github.com/openpipe/ART" } -} \ No newline at end of file +} diff --git a/docs/features/tracking-metrics.mdx b/docs/features/tracking-metrics.mdx new file mode 100644 index 000000000..3aea84e98 --- /dev/null +++ b/docs/features/tracking-metrics.mdx @@ -0,0 +1,167 @@ +--- +title: "Tracking Metrics" +description: "See what ART logs automatically and how to add your own metrics and costs." +sidebarTitle: "Tracking Metrics" +icon: "chart-line" +--- + +ART writes a metrics row every time you call `model.log(...)`. Those rows go to +`history.jsonl` in the run directory and, if W&B logging is enabled, to W&B. 
+ +Use this page for three things: + +- understand the metrics ART emits automatically +- add task-specific metrics from your own rollout code +- track external judge and API spend alongside training metrics + +## What ART logs automatically + +When you call `await model.train(...)` or `await model.log(train_groups, split="train")`, +ART already logs most of the metrics you need to monitor a run. + +| Type | Examples | +| --- | --- | +| Reward | `reward/mean`, `reward/std_dev`, `reward/exception_rate` | +| Loss | `loss/train`, `loss/entropy`, `loss/kl_div`, `loss/grad_norm`, `loss/learning_rate` | +| Data | `data/step_num_scenarios`, `data/step_num_trajectories`, `data/step_num_groups_submitted`, `data/step_num_groups_trainable` | +| Train summary | `train/num_groups_submitted`, `train/num_groups_trainable`, `train/num_trajectories` | +| Time | `time/wall_clock_sec`, `time/step_wall_s`, `time/step_trainer_s` | +| Cost | `costs/gpu` on `LocalBackend` when GPU pricing is known | + +If ART has the inputs it needs, it also derives: + +- cumulative metrics such as `time/cum/trainer_s`, `data/cum/num_unique_scenarios`, and `costs/cum/all` +- cost rollups such as `costs/train`, `costs/eval`, and `costs/all` +- throughput metrics such as `throughput/avg_trainer_tok_per_s` and `throughput/avg_actor_tok_per_s` + +<Note> + Some metrics only appear when the backend or your code provides the underlying + inputs. For example, `throughput/avg_actor_tok_per_s` requires both + `data/step_actor_tokens` and `time/step_actor_s`. +</Note> + +## Add task-specific outcome metrics + +Attach metrics directly to each `Trajectory` when your rollout code knows whether +an attempt succeeded, how many tools it called, or any other task-specific +signal. 
+ +```python +async def rollout(model: art.Model, scenario: Scenario) -> art.Trajectory: + trajectory = art.Trajectory( + messages_and_choices=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": scenario.prompt}, + ], + metadata={"scenario_id": scenario.id}, + ) + + completion = await model.openai_client().chat.completions.create( + model=model.get_inference_name(), + messages=trajectory.messages(), + ) + trajectory.messages_and_choices.append(completion.choices[0]) + + trajectory.reward = score_reward(trajectory) + trajectory.metrics["correct"] = float(is_correct(trajectory)) + trajectory.metrics["tool_calls"] = float(count_tool_calls(trajectory)) + return trajectory +``` + +On train steps, ART averages those rollout metrics and logs them under the +`reward/` namespace, such as `reward/correct` and `reward/tool_calls`. + +If you want to record one value per `TrajectoryGroup` instead of one per +trajectory, pass `metrics={...}` when you build the group. ART logs those once +per group, using keys like `reward/group_difficulty` on train steps. + +## Add step-level metrics ART cannot infer + +Use `model.metrics_builder()` for metrics that live outside individual +trajectories, such as actor-side timing, token counts, or idle time. 
+ +```python +builder = model.metrics_builder() + +with builder.measure("time/step_actor_s"): + result = await run_rollouts() + +builder.add_data( + step_num_scenarios=result.num_scenarios, + step_actor_tokens=result.actor_tokens, + scenario_ids=result.scenario_ids, +) +builder.add_idle_times(step_actor_idle_s=result.actor_idle_s) + +await model.log(result.train_groups, split="train", step=result.step) +``` + +A few useful patterns: + +- log `scenario_ids` to unlock `data/cum/num_unique_scenarios` +- log both `data/step_actor_tokens` and `time/step_actor_s` to unlock actor throughput metrics +- log `time/step_eval_s` when eval runs happen outside the backend +- use fully qualified keys like `time/step_actor_s` or `data/step_actor_tokens` for builder-managed metrics + +ART flushes builder-managed metrics on the next `model.log(...)` or +`model.train(...)` call. + +## Track judge and API costs + +Use `@track_api_cost` when a function returns a provider response object with +token usage. Wrap the relevant part of your code in a metrics context so ART +knows whether the spend belongs to training or evaluation. + +```python +from art.metrics import track_api_cost + +@track_api_cost( + source="llm_judge/correctness", + provider="openai", + model_name="openai/gpt-4.1", +) +async def run_judge(client, messages): + return await client.chat.completions.create( + model="gpt-4.1", + messages=messages, + ) + +with model.metrics_builder("train").activate_context(): + await run_judge(judge_client, train_messages) + +with model.metrics_builder("eval").activate_context(): + await run_judge(judge_client, eval_messages) +``` + +The next metrics row will include: + +- `costs/train/llm_judge/correctness` or `costs/eval/llm_judge/correctness` +- rollups such as `costs/train`, `costs/eval`, and `costs/all` +- cumulative totals such as `costs/cum/all` + +ART can price OpenAI and Anthropic responses from their usage fields. 
You must +pass both `provider` and `model_name` to `@track_api_cost`. + +For custom pricing or unsupported models, register pricing on the builder: + +```python +builder = model.metrics_builder() +builder.register_model_pricing( + "anthropic/my-custom-judge", + prompt_per_million=1.2, + completion_per_million=4.8, +) +``` + +## Track GPU cost on LocalBackend + +`LocalBackend` can log `costs/gpu` automatically on train steps. ART currently +auto-detects H200 pricing at `$3/hour` per GPU. For other hardware, pass an +explicit override: + +```python +backend = LocalBackend(gpu_cost_per_hour_usd=2.25) +``` + +This lets ART include GPU spend in the same metrics stream as rewards, losses, +and judge/API costs. diff --git a/docs/getting-started/quick-start.mdx b/docs/getting-started/quick-start.mdx index 58eb0ccf0..63a38e02c 100644 --- a/docs/getting-started/quick-start.mdx +++ b/docs/getting-started/quick-start.mdx @@ -38,4 +38,4 @@ At the top of the [notebook](https://colab.research.google.com/github/openpipe/a ## Step 3: Track metrics -While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training! +While your run progresses, observe its traces and metrics in your [W&B workspace](https://wandb.ai/home). You should start seeing some progress in the first 20-30 steps. For a guide to the metrics ART logs automatically and how to add your own, see [Tracking Metrics](/features/tracking-metrics). If you have questions along the way, please ask in the [Discord](https://discord.gg/zbBHRUpwf4). Happy training! 
diff --git a/pyproject.toml b/pyproject.toml index f5563f66e..1941244ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,6 +116,9 @@ force-sort-within-sections = true [tool.pytest.ini_options] asyncio_mode = "auto" +markers = [ + "live_api_cost: opt-in live API cost validation against provider endpoints", +] [tool.uv] required-version = ">=0.6.15" diff --git a/src/art/api_costs.py b/src/art/api_costs.py new file mode 100644 index 000000000..1bbd9ed95 --- /dev/null +++ b/src/art/api_costs.py @@ -0,0 +1,502 @@ +from __future__ import annotations + +from collections.abc import Awaitable, Callable, Mapping +from dataclasses import dataclass +from functools import wraps +from inspect import iscoroutinefunction +from typing import Any, ParamSpec, TypeVar, cast + +from .costs import get_model_pricing, tokens_to_cost + +OPENAI_PROVIDER = "openai" +ANTHROPIC_PROVIDER = "anthropic" + +P = ParamSpec("P") +R = TypeVar("R") + +CostExtractor = Callable[[Any], float | None] +ResponseGetter = Callable[[Any], Any] + + +@dataclass(frozen=True) +class TokenPricing: + prompt_per_million: float + completion_per_million: float + cached_prompt_per_million: float | None = None + cache_creation_per_million: float | None = None + cache_read_per_million: float | None = None + + +@dataclass(frozen=True) +class _OpenAITokenUsage: + prompt_tokens: float + completion_tokens: float + cached_prompt_tokens: float + + +@dataclass(frozen=True) +class _AnthropicTokenUsage: + input_tokens: float + output_tokens: float + cache_creation_input_tokens: float + cache_read_input_tokens: float + + +MODEL_TOKEN_PRICING: dict[str, TokenPricing] = { + "openai/gpt-4.1": TokenPricing( + prompt_per_million=2.0, + completion_per_million=8.0, + cached_prompt_per_million=0.5, + ), + "anthropic/claude-sonnet-4-6": TokenPricing( + prompt_per_million=3.0, + completion_per_million=15.0, + cache_creation_per_million=3.75, + cache_read_per_million=0.30, + ), +} + + +def _configured_token_pricing(model_name: str) -> 
TokenPricing | None: + explicit = MODEL_TOKEN_PRICING.get(model_name) + if explicit is not None: + return explicit + + pricing = get_model_pricing(model_name) + if pricing is None: + return None + return TokenPricing( + prompt_per_million=pricing.prefill, + completion_per_million=pricing.sample, + ) + + +def normalize_provider(provider: str | None) -> str | None: + if provider is None: + return None + normalized = provider.strip().lower() + if not normalized: + return None + return normalized + + +def _read_usage_field(usage: Any, field: str) -> float | None: + if usage is None: + return None + if isinstance(usage, dict): + value = usage.get(field) + else: + value = getattr(usage, field, None) + if value is None: + return None + return float(value) + + +def _read_usage_nested_field(usage: Any, *fields: str) -> float | None: + current = usage + for field in fields: + if current is None: + return None + if isinstance(current, dict): + current = current.get(field) + else: + current = getattr(current, field, None) + if current is None: + return None + return float(current) + + +def _response_usage(response: Any) -> Any: + if isinstance(response, dict): + return response.get("usage") + return getattr(response, "usage", None) + + +def _extract_openai_token_counts(response: Any) -> _OpenAITokenUsage | None: + usage = _response_usage(response) + prompt_tokens = _read_usage_field(usage, "prompt_tokens") + completion_tokens = _read_usage_field(usage, "completion_tokens") + cached_prompt_tokens = ( + _read_usage_nested_field(usage, "prompt_tokens_details", "cached_tokens") or 0.0 + ) + if ( + prompt_tokens is None + and completion_tokens is None + and cached_prompt_tokens == 0.0 + ): + return None + total_prompt_tokens = prompt_tokens or 0.0 + return _OpenAITokenUsage( + prompt_tokens=total_prompt_tokens, + completion_tokens=completion_tokens or 0.0, + cached_prompt_tokens=min(cached_prompt_tokens, total_prompt_tokens), + ) + + +def _extract_anthropic_token_counts(response: 
Any) -> _AnthropicTokenUsage | None: + usage = _response_usage(response) + input_tokens = _read_usage_field(usage, "input_tokens") + output_tokens = _read_usage_field(usage, "output_tokens") + cache_creation_input_tokens = ( + _read_usage_field(usage, "cache_creation_input_tokens") or 0.0 + ) + cache_read_input_tokens = _read_usage_field(usage, "cache_read_input_tokens") or 0.0 + if ( + input_tokens is None + and output_tokens is None + and cache_creation_input_tokens == 0.0 + and cache_read_input_tokens == 0.0 + ): + return None + return _AnthropicTokenUsage( + input_tokens=input_tokens or 0.0, + output_tokens=output_tokens or 0.0, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + + +def _estimate_openai_cost( + token_counts: _OpenAITokenUsage | None, + pricing: TokenPricing, +) -> float | None: + if token_counts is None: + return None + uncached_prompt_tokens = max( + token_counts.prompt_tokens - token_counts.cached_prompt_tokens, + 0.0, + ) + cached_prompt_price = ( + pricing.cached_prompt_per_million + if pricing.cached_prompt_per_million is not None + else pricing.prompt_per_million + ) + return ( + tokens_to_cost(uncached_prompt_tokens, pricing.prompt_per_million) + + tokens_to_cost( + token_counts.cached_prompt_tokens, + cached_prompt_price, + ) + + tokens_to_cost( + token_counts.completion_tokens, + pricing.completion_per_million, + ) + ) + + +def _estimate_anthropic_cost( + token_counts: _AnthropicTokenUsage | None, + pricing: TokenPricing, +) -> float | None: + if token_counts is None: + return None + cache_creation_price = ( + pricing.cache_creation_per_million + if pricing.cache_creation_per_million is not None + else pricing.prompt_per_million + ) + cache_read_price = ( + pricing.cache_read_per_million + if pricing.cache_read_per_million is not None + else pricing.prompt_per_million + ) + return ( + tokens_to_cost(token_counts.input_tokens, pricing.prompt_per_million) + + tokens_to_cost( 
+ token_counts.cache_creation_input_tokens, + cache_creation_price, + ) + + tokens_to_cost( + token_counts.cache_read_input_tokens, + cache_read_price, + ) + + tokens_to_cost( + token_counts.output_tokens, + pricing.completion_per_million, + ) + ) + + +def _estimate_provider_cost( + provider_name: str, + response: Any, + pricing: TokenPricing, +) -> float | None: + if provider_name == OPENAI_PROVIDER: + return _estimate_openai_cost(_extract_openai_token_counts(response), pricing) + if provider_name == ANTHROPIC_PROVIDER: + return _estimate_anthropic_cost( + _extract_anthropic_token_counts(response), + pricing, + ) + return None + + +def _resolve_registered_or_default_pricing( + model_name: str, + *, + model_pricing: Mapping[str, TokenPricing], +) -> TokenPricing | None: + registered = model_pricing.get(model_name) + if registered is not None: + return registered + return _configured_token_pricing(model_name) + + +def _merge_token_pricing( + *, + base_pricing: TokenPricing, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, +) -> TokenPricing: + return TokenPricing( + prompt_per_million=( + float(prompt_price_per_million) + if prompt_price_per_million is not None + else base_pricing.prompt_per_million + ), + completion_per_million=( + float(completion_price_per_million) + if completion_price_per_million is not None + else base_pricing.completion_per_million + ), + cached_prompt_per_million=( + float(cached_prompt_price_per_million) + if cached_prompt_price_per_million is not None + else base_pricing.cached_prompt_per_million + ), + cache_creation_per_million=( + float(cache_creation_price_per_million) + if cache_creation_price_per_million is not None + else base_pricing.cache_creation_per_million + ), + cache_read_per_million=( + float(cache_read_price_per_million) + if 
cache_read_price_per_million is not None + else base_pricing.cache_read_per_million + ), + ) + + +def normalize_model_name(model_name: str | None) -> str | None: + if model_name is None: + return None + normalized = model_name.strip() + if not normalized: + return None + return normalized + + +def _resolve_token_pricing( + *, + provider: str, + model_name: str, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, + model_pricing: Mapping[str, TokenPricing], +) -> TokenPricing: + explicit_prompt_price = ( + float(prompt_price_per_million) + if prompt_price_per_million is not None + else None + ) + explicit_completion_price = ( + float(completion_price_per_million) + if completion_price_per_million is not None + else None + ) + explicit_cached_prompt_price = ( + float(cached_prompt_price_per_million) + if cached_prompt_price_per_million is not None + else None + ) + explicit_cache_creation_price = ( + float(cache_creation_price_per_million) + if cache_creation_price_per_million is not None + else None + ) + explicit_cache_read_price = ( + float(cache_read_price_per_million) + if cache_read_price_per_million is not None + else None + ) + + if normalize_provider(provider) is None: + raise ValueError("provider must be non-empty") + + normalized_model_name = normalize_model_name(model_name) + if normalized_model_name is None: + raise ValueError("model_name must be non-empty") + + configured_pricing = _resolve_registered_or_default_pricing( + normalized_model_name, + model_pricing=model_pricing, + ) + if configured_pricing is None: + raise ValueError( + f"No pricing configured for model '{normalized_model_name}'. " + "Add it to art.api_costs.MODEL_TOKEN_PRICING, art.costs.MODEL_PRICING, " + "or register it with MetricsBuilder.register_model_pricing()." 
+ ) + + return _merge_token_pricing( + base_pricing=configured_pricing, + prompt_price_per_million=explicit_prompt_price, + completion_price_per_million=explicit_completion_price, + cached_prompt_price_per_million=explicit_cached_prompt_price, + cache_creation_price_per_million=explicit_cache_creation_price, + cache_read_price_per_million=explicit_cache_read_price, + ) + + +def extract_api_cost( + response: Any, + *, + provider: str, + model_name: str, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, + cost_extractors: Mapping[str, CostExtractor], + model_pricing: Mapping[str, TokenPricing], +) -> float | None: + provider_name = normalize_provider(provider) + if provider_name is None: + raise ValueError("provider must be non-empty") + + custom_extractor = cost_extractors.get(provider_name) + if custom_extractor is not None: + custom_cost = custom_extractor(response) + if custom_cost is not None: + return float(custom_cost) + + pricing = _resolve_token_pricing( + provider=provider_name, + model_name=model_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, + model_pricing=model_pricing, + ) + provider_cost = _estimate_provider_cost(provider_name, response, pricing) + if provider_cost is not None: + return provider_cost + + if provider_name in {OPENAI_PROVIDER, ANTHROPIC_PROVIDER}: + raise ValueError( + f"Response usage does not match provider '{provider_name}'. " + "Pass the correct provider/model pair or register a custom cost extractor." 
+ ) + raise ValueError(f"No cost extractor registered for provider '{provider_name}'.") + + +def _record_api_cost( + *, + result: Any, + source: str, + provider: str, + response_getter: ResponseGetter | None, + model_name: str, + prompt_price_per_million: float | None, + completion_price_per_million: float | None, + cached_prompt_price_per_million: float | None, + cache_creation_price_per_million: float | None, + cache_read_price_per_million: float | None, +) -> None: + try: + from .metrics import MetricsBuilder + + builder = MetricsBuilder.get_active() + except LookupError: + return + + response = response_getter(result) if response_getter is not None else result + builder.add_response_cost( + source, + response, + provider=provider, + model_name=model_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, + ) + + +def track_api_cost( + *, + source: str, + provider: str, + model_name: str, + response_getter: ResponseGetter | None = None, + prompt_price_per_million: float | None = None, + completion_price_per_million: float | None = None, + cached_prompt_price_per_million: float | None = None, + cache_creation_price_per_million: float | None = None, + cache_read_price_per_million: float | None = None, +) -> Callable[[Callable[P, R]], Callable[P, R]]: + normalized_source = source.strip("/") + if not normalized_source: + raise ValueError("source must be non-empty") + + normalized_provider = normalize_provider(provider) + if normalized_provider is None: + raise ValueError("provider must be non-empty") + normalized_model_name = normalize_model_name(model_name) + if normalized_model_name is None: + raise ValueError("model_name must be non-empty") + + def _decorate(func: Callable[P, R]) -> Callable[P, R]: + if 
iscoroutinefunction(func): + async_func = cast(Callable[P, Awaitable[Any]], func) + + @wraps(func) + async def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any: + result = await async_func(*args, **kwargs) + _record_api_cost( + result=result, + source=normalized_source, + provider=normalized_provider, + response_getter=response_getter, + model_name=normalized_model_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, + ) + return result + + return cast(Callable[P, R], _async_wrapper) + + @wraps(func) + def _sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + result = func(*args, **kwargs) + _record_api_cost( + result=result, + source=normalized_source, + provider=normalized_provider, + response_getter=response_getter, + model_name=normalized_model_name, + prompt_price_per_million=prompt_price_per_million, + completion_price_per_million=completion_price_per_million, + cached_prompt_price_per_million=cached_prompt_price_per_million, + cache_creation_price_per_million=cache_creation_price_per_million, + cache_read_price_per_million=cache_read_price_per_million, + ) + return result + + return _sync_wrapper + + return _decorate diff --git a/src/art/costs.py b/src/art/costs.py index 5ee5523a9..e3e2b2b47 100644 --- a/src/art/costs.py +++ b/src/art/costs.py @@ -16,7 +16,7 @@ class ModelPricing: TokenCount: TypeAlias = int | None -CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount], dict[str, float]] +CostCalculator: TypeAlias = Callable[[TokenCount, TokenCount, str], dict[str, float]] # Pricing per model ($/1M tokens). Keep in sync with infra pricing. 
MODEL_PRICING: dict[str, ModelPricing] = { @@ -88,16 +88,20 @@ def compute_sample_costs( *, prompt_tokens: int | None, completion_tokens: int | None, + cost_context: str, pricing: ModelPricing, ) -> dict[str, float]: """Compute prompt+completion costs for a single API call.""" + normalized_context = cost_context.strip("/") + if not normalized_context: + raise ValueError("cost_context must be non-empty") prompt_value = float(prompt_tokens or 0) completion_value = float(completion_tokens or 0) prefill_cost = tokens_to_cost(prompt_value, pricing.prefill) sample_cost = tokens_to_cost(completion_value, pricing.sample) return { - "costs_prefill": prefill_cost, - "costs_sample": sample_cost, + f"costs/{normalized_context}/prefill": prefill_cost, + f"costs/{normalized_context}/sample": sample_cost, } @@ -105,11 +109,14 @@ def build_cost_calculator(pricing: ModelPricing) -> CostCalculator: """Return a callable that computes prompt+completion costs for a request.""" def _calculator( - prompt_tokens: int | None, completion_tokens: int | None + prompt_tokens: int | None, + completion_tokens: int | None, + cost_context: str, ) -> dict[str, float]: return compute_sample_costs( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, + cost_context=cost_context, pricing=pricing, ) diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 876768938..c8b0570a2 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -6,12 +6,17 @@ import shutil import socket import subprocess +import time from types import TracebackType from typing import AsyncIterator, Iterable, Literal, cast import warnings logger = logging.getLogger(__name__) +_AUTO_GPU_HOURLY_PRICING_USD = { + "H200": 3.0, +} + import aiohttp import numpy as np from openai import AsyncOpenAI @@ -39,6 +44,12 @@ from .. 
def supports_automatic_train_step_metrics(self) -> bool:
    """Report that this backend provides automatic train-step metrics."""
    return True


def automatic_gpu_cost_per_hour_usd(self, model: Model) -> float | None:
    """Return the hourly USD price of all GPUs allocated to *model*.

    The per-GPU price comes from ``_resolve_gpu_cost_per_hour_usd`` and the
    GPU count from ``_allocated_gpu_count``. ``None`` is returned when the
    price cannot be resolved or when no GPU is counted.
    """
    hourly_rate = self._resolve_gpu_cost_per_hour_usd()
    if hourly_rate is None:
        return None

    allocated = self._allocated_gpu_count(model)
    return hourly_rate * allocated if allocated > 0 else None


def _resolve_gpu_cost_per_hour_usd(self) -> float | None:
    """Return the per-GPU hourly price, or ``None`` when it is unknown.

    An explicit override stored on the backend wins. Otherwise every visible
    CUDA device name is matched (case-insensitively, by substring) against
    ``_AUTO_GPU_HOURLY_PRICING_USD``; a price is returned only when all
    devices match and they all resolve to the same price.
    """
    override = self._gpu_cost_per_hour_usd
    if override is not None:
        return override
    if not torch.cuda.is_available():
        return None

    device_total = torch.cuda.device_count()
    if device_total <= 0:
        return None

    distinct_prices: set[float] = set()
    for device_index in range(device_total):
        label = torch.cuda.get_device_name(device_index).upper()
        price = next(
            (
                hourly
                for marker, hourly in _AUTO_GPU_HOURLY_PRICING_USD.items()
                if marker in label
            ),
            None,
        )
        if price is None:
            # One unrecognised device is enough to refuse to guess.
            return None
        distinct_prices.add(price)

    # Mixed-price fleets are ambiguous, so no price is reported for them.
    if len(distinct_prices) != 1:
        return None
    return next(iter(distinct_prices))


def _allocated_gpu_count(self, model: Model) -> int:
    """Count the GPUs attributed to *model*.

    For a ``TrainableModel`` with an internal config, the union of its
    ``trainer_gpu_ids`` and ``inference_gpu_ids`` is used when non-empty.
    Otherwise the number of visible CUDA devices is returned (0 without CUDA).
    """
    if isinstance(model, TrainableModel) and model._internal_config is not None:
        internal = model._internal_config
        gpu_ids = set(internal.get("trainer_gpu_ids", [])).union(
            internal.get("inference_gpu_ids", [])
        )
        if gpu_ids:
            return len(gpu_ids)

    return torch.cuda.device_count() if torch.cuda.is_available() else 0
0.0, + TRAIN_GRADIENT_STEPS_KEY: 0.0, } return + base_metrics["data/step_trainer_tokens"] = float( + packed_tensors["assistant_mask"].sum().item() + ) disk_packed_tensors = packed_tensors_to_dir( packed_tensors, f"{get_model_dir(model=model, art_path=self._path)}/tensors" ) # Note: scale_learning_rate_by_reward_std_dev is now handled by the frontend (Model.train()) - results: list[dict[str, float]] = [] estimated_gradient_steps = disk_packed_tensors["num_sequences"] pbar = tqdm.tqdm(total=estimated_gradient_steps, desc="train") async for result in service.train( disk_packed_tensors, config, dev_config, verbose ): num_gradient_steps = int( - result.pop("num_gradient_steps", estimated_gradient_steps) + result.pop(TRAIN_GRADIENT_STEPS_KEY, estimated_gradient_steps) ) assert num_gradient_steps == estimated_gradient_steps, ( f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}" ) - results.append(result) - yield {**result, "num_gradient_steps": num_gradient_steps} + yield { + **base_metrics, + **result, + TRAIN_GRADIENT_STEPS_KEY: float(num_gradient_steps), + } pbar.update(1) pbar.set_postfix(result) pbar.close() @@ -793,15 +881,22 @@ async def _train_sft( service = await self._get_service(model) pbar = tqdm.tqdm(total=len(batches), desc="sft train") - total_trainable_tokens = 0 + total_trainable_tokens = sum(batch.num_trainable_tokens for batch in batches) + total_trajectories = len(trajectory_list) batch_count = 0 async for result in service.train_sft(batches, verbose): pbar.update(1) - pbar.set_postfix({"loss": f"{result.get('loss', 0):.4f}"}) - total_trainable_tokens += result.get("num_trainable_tokens", 0) + pbar.set_postfix({"loss": f"{result.get('loss/train', 0):.4f}"}) batch_count += 1 - yield result + yield { + **result, + "data/step_num_trajectories": float(total_trajectories), + "data/step_trainer_tokens": float(total_trainable_tokens), + TRAIN_GRADIENT_STEPS_KEY: float(len(batches)), + "train/num_trajectories": 
# Context-local reference to the builder that is currently active.
_active_builder: ContextVar["MetricsBuilder"] = ContextVar("_active_metrics_builder")

# Sections whose flushed values are also accumulated under "<section>/cum/...".
_HIERARCHICAL_SECTIONS = {"costs", "time", "data"}
# Step-level idle metrics and the cumulative keys they are summed into.
_THROUGHPUT_IDLE_MAPPINGS = {
    "throughput/step_trainer_idle_s": "throughput/cum/trainer_idle_s",
    "throughput/step_actor_idle_s": "throughput/cum/actor_idle_s",
}
# Characters trimmed from both ends of a cost context before it is used in a key.
_COST_CONTEXT_TRIM_CHARS = "/ \t\r\n"


def is_cumulative_metric_key(key: str) -> bool:
    """Return True when the second path component of *key* is ``cum``."""
    parts = key.split("/", 2)
    return len(parts) >= 2 and parts[1] == "cum"


def is_builder_managed_metric(key: str) -> bool:
    """Return True for keys under ``costs/`` or any ``*/step_`` builder prefix."""
    return key.startswith(("costs/", "time/step_", "data/step_", "throughput/step_"))


def to_cumulative_metric_key(key: str) -> str:
    """Map a step-level key to its cumulative counterpart.

    ``time/step_wall_s`` becomes ``time/cum/wall_s`` and ``costs/gpu`` becomes
    ``costs/cum/gpu``: ``cum`` is inserted after the section and a leading
    ``step_`` on the remainder is dropped.

    Raises:
        ValueError: If *key* is already cumulative or has no ``section/`` prefix.
    """
    if is_cumulative_metric_key(key):
        raise ValueError(f"Metric key '{key}' is already cumulative.")

    section, separator, rest = key.partition("/")
    if not separator:
        # Same exception type the bare tuple-unpack used to raise, but with a
        # message that names the offending key.
        raise ValueError(f"Metric key '{key}' must include a section prefix.")
    return f"{section}/cum/{rest.removeprefix('step_')}"


def _normalize_cost_context(cost_context: str) -> str:
    """Trim whitespace and slashes from *cost_context*; reject empty results."""
    if not cost_context:
        raise ValueError("cost_context must be non-empty")
    normalized = cost_context.strip(_COST_CONTEXT_TRIM_CHARS)
    if not normalized:
        raise ValueError("cost_context must be non-empty")
    return normalized


@dataclass
class _SharedMetricsState:
    """Mutable state shared by every builder derived from the same root."""

    lock: asyncio.Lock  # held for the duration of flush()
    step_buffer: dict[str, float]  # metrics added since the last flush
    cum_state: dict[str, float]  # running totals keyed by cumulative metric key
    unique_scenario_ids: set[str]  # every scenario id seen in flushed steps
    pending_scenario_ids: set[str]  # ids added since the last flush
    cost_extractors: dict[str, CostExtractor]  # keyed by normalized provider
    model_pricing: dict[str, TokenPricing]  # keyed by normalized model name


def _new_shared_metrics_state() -> _SharedMetricsState:
    """Create an empty shared state with a fresh lock."""
    return _SharedMetricsState(
        lock=asyncio.Lock(),
        step_buffer={},
        cum_state={},
        unique_scenario_ids=set(),
        pending_scenario_ids=set(),
        cost_extractors={},
        model_pricing={},
    )


class MetricsBuilder:
    """Build and accumulate step-level metrics for logging."""

    def __init__(
        self,
        cost_context: str,
        *,
        _shared_state: _SharedMetricsState | None = None,
    ) -> None:
        """Create a builder whose response costs are filed under *cost_context*.

        Args:
            cost_context: Path segment used by ``add_response_cost``; surrounding
                whitespace and slashes are trimmed.
            _shared_state: Internal. Existing state to share; a new one is
                created when omitted.

        Raises:
            ValueError: If *cost_context* is empty once trimmed.
        """
        # Normalize here as well as in for_cost_context so "train/" and
        # "train" name the same context and never yield "costs/train//src".
        self.cost_context = _normalize_cost_context(cost_context)
        self._shared_state = (
            _shared_state if _shared_state is not None else _new_shared_metrics_state()
        )

    def add_cost(self, path: str, usd: float) -> None:
        """Add *usd* to the step metric ``costs/<path>``."""
        if not path:
            raise ValueError("Cost path must be non-empty")
        full_key = f"costs/{path}"
        self.add_metric(full_key, float(usd))

    def add_response_cost(
        self,
        source: str,
        response: Any,
        *,
        provider: str,
        model_name: str,
        prompt_price_per_million: float | None = None,
        completion_price_per_million: float | None = None,
        cached_prompt_price_per_million: float | None = None,
        cache_creation_price_per_million: float | None = None,
        cache_read_price_per_million: float | None = None,
    ) -> float | None:
        """Price *response* and record it under ``costs/<cost_context>/<source>``.

        The cost is computed by ``extract_api_cost`` using the extractors and
        pricing registered on the shared state.

        Returns:
            The recorded cost, or ``None`` (nothing recorded) when
            ``extract_api_cost`` returns ``None``.

        Raises:
            ValueError: If *source* is empty after stripping slashes.
        """
        normalized_source = source.strip("/")
        if not normalized_source:
            raise ValueError("source must be non-empty")

        cost = extract_api_cost(
            response,
            provider=provider,
            model_name=model_name,
            prompt_price_per_million=prompt_price_per_million,
            completion_price_per_million=completion_price_per_million,
            cached_prompt_price_per_million=cached_prompt_price_per_million,
            cache_creation_price_per_million=cache_creation_price_per_million,
            cache_read_price_per_million=cache_read_price_per_million,
            cost_extractors=self._shared_state.cost_extractors,
            model_pricing=self._shared_state.model_pricing,
        )
        if cost is None:
            return None

        self.add_cost(f"{self.cost_context}/{normalized_source}", cost)
        return cost

    def add_metric(self, key: str, value: float) -> None:
        """Add *value* to the step metric *key*; repeated adds are summed.

        Raises:
            ValueError: If *key* has no ``/``, uses the cumulative namespace, or
                conflicts with an existing key as leaf versus ancestor.
        """
        if "/" not in key:
            raise ValueError("Metric key must include a section prefix")
        self._validate_and_add(key, float(value))

    def add_data(
        self,
        step_num_scenarios: int | None = None,
        step_actor_tokens: int | None = None,
        scenario_ids: list[str] | None = None,
    ) -> None:
        """Record data metrics for the step; ``None`` arguments are skipped.

        *scenario_ids* are stringified and held until the next flush, where
        they feed the unique-scenario count.
        """
        if step_num_scenarios is not None:
            self.add_metric("data/step_num_scenarios", float(step_num_scenarios))
        if step_actor_tokens is not None:
            self.add_metric("data/step_actor_tokens", float(step_actor_tokens))
        if scenario_ids is not None:
            self._shared_state.pending_scenario_ids.update(
                str(scenario_id) for scenario_id in scenario_ids
            )

    def add_user_timing(
        self,
        step_wall_s: float | None = None,
        step_actor_s: float | None = None,
        step_eval_s: float | None = None,
    ) -> None:
        """Record ``time/step_*`` metrics; ``None`` arguments are skipped."""
        if step_wall_s is not None:
            self.add_metric("time/step_wall_s", float(step_wall_s))
        if step_actor_s is not None:
            self.add_metric("time/step_actor_s", float(step_actor_s))
        if step_eval_s is not None:
            self.add_metric("time/step_eval_s", float(step_eval_s))

    def add_idle_times(
        self,
        step_trainer_idle_s: float | None = None,
        step_actor_idle_s: float | None = None,
    ) -> None:
        """Record ``throughput/step_*_idle_s`` metrics; ``None`` is skipped."""
        if step_trainer_idle_s is not None:
            self.add_metric(
                "throughput/step_trainer_idle_s",
                float(step_trainer_idle_s),
            )
        if step_actor_idle_s is not None:
            self.add_metric("throughput/step_actor_idle_s", float(step_actor_idle_s))

    @contextmanager
    def measure(self, key: str):
        """Context manager adding the elapsed monotonic seconds to *key*.

        The duration is recorded even when the managed block raises.
        """
        started = time.monotonic()
        try:
            yield
        finally:
            self.add_metric(key, time.monotonic() - started)

    async def flush(self) -> dict[str, float]:
        """Return this step's metrics and reset the step buffer.

        Under the shared lock this copies the step buffer, adds cost rollups,
        folds every ``costs``/``time``/``data`` value into its cumulative key,
        reports the unique-scenario count when ids were added this step,
        derives throughput metrics, then clears the buffer and pending ids.
        """
        async with self._shared_state.lock:
            result = dict(self._shared_state.step_buffer)
            cost_metrics = {
                key: value
                for key, value in self._shared_state.step_buffer.items()
                if key.startswith("costs/")
            }
            result.update(self._compute_rollups(cost_metrics))

            # Iterate over a snapshot: cumulative keys are inserted as we go.
            for key, value in list(result.items()):
                section = key.split("/", 1)[0]
                if section not in _HIERARCHICAL_SECTIONS:
                    continue
                cum_key = to_cumulative_metric_key(key)
                next_value = self._shared_state.cum_state.get(cum_key, 0.0) + value
                self._shared_state.cum_state[cum_key] = next_value
                result[cum_key] = next_value

            if self._shared_state.pending_scenario_ids:
                self._shared_state.unique_scenario_ids.update(
                    self._shared_state.pending_scenario_ids
                )
                result["data/cum/num_unique_scenarios"] = float(
                    len(self._shared_state.unique_scenario_ids)
                )

            self._update_throughput_metrics(result)
            self._shared_state.step_buffer.clear()
            self._shared_state.pending_scenario_ids.clear()
            return result

    def activate(self) -> Token["MetricsBuilder"]:
        """Make this builder the active one; return the token needed to undo it."""
        return _active_builder.set(self)

    @contextmanager
    def activate_context(self):
        """Context manager that activates this builder and restores the previous one."""
        token = self.activate()
        try:
            yield self
        finally:
            token.var.reset(token)

    @staticmethod
    def get_active() -> "MetricsBuilder":
        """Return the active builder.

        Raises:
            LookupError: If no builder has been activated in the current
                context (the context variable has no default).
        """
        return _active_builder.get()

    def for_cost_context(self, cost_context: str) -> "MetricsBuilder":
        """Return a builder for *cost_context* that shares this builder's state.

        Returns ``self`` when the normalized context equals the current one.

        Raises:
            ValueError: If *cost_context* is empty once trimmed.
        """
        normalized_cost_context = _normalize_cost_context(cost_context)
        if normalized_cost_context == self.cost_context:
            return self
        return MetricsBuilder(
            cost_context=normalized_cost_context,
            _shared_state=self._shared_state,
        )

    def register_cost_extractor(self, provider: str, extractor: CostExtractor) -> None:
        """Register *extractor* for the normalized *provider* name.

        Raises:
            ValueError: If ``normalize_provider`` returns ``None``.
        """
        normalized_provider = normalize_provider(provider)
        if normalized_provider is None:
            raise ValueError("provider must be non-empty")
        self._shared_state.cost_extractors[normalized_provider] = extractor

    def register_model_pricing(
        self,
        model_name: str,
        *,
        prompt_per_million: float,
        completion_per_million: float,
        cached_prompt_per_million: float | None = None,
        cache_creation_per_million: float | None = None,
        cache_read_per_million: float | None = None,
    ) -> None:
        """Register token pricing for the normalized *model_name*.

        Prices are coerced to ``float``; optional prices stay ``None`` when
        omitted.

        Raises:
            ValueError: If the normalized model name is empty.
        """
        normalized_model_name = normalize_model_name(model_name)
        if not normalized_model_name:
            raise ValueError("model_name must be non-empty")
        self._shared_state.model_pricing[normalized_model_name] = TokenPricing(
            prompt_per_million=float(prompt_per_million),
            completion_per_million=float(completion_per_million),
            cached_prompt_per_million=(
                float(cached_prompt_per_million)
                if cached_prompt_per_million is not None
                else None
            ),
            cache_creation_per_million=(
                float(cache_creation_per_million)
                if cache_creation_per_million is not None
                else None
            ),
            cache_read_per_million=(
                float(cache_read_per_million)
                if cache_read_per_million is not None
                else None
            ),
        )

    def state_dict(self) -> dict[str, Any]:
        """Return the cumulative totals and seen scenario ids as plain containers."""
        return {
            "cum_state": dict(self._shared_state.cum_state),
            # Sorted so that identical state always serializes identically.
            "unique_scenario_ids": sorted(self._shared_state.unique_scenario_ids),
        }

    def load_state_dict(self, state: dict[str, Any]) -> None:
        """Replace cumulative totals and seen scenario ids with those in *state*.

        Missing entries are treated as empty. Pending scenario ids are
        discarded; the step buffer is left untouched.
        """
        raw_cum_state = state.get("cum_state", {})
        raw_unique_ids = state.get("unique_scenario_ids", [])
        # Coerce before mutating so a bad payload leaves current state intact.
        restored_cum_state = {str(k): float(v) for k, v in raw_cum_state.items()}
        restored_unique_ids = {str(v) for v in raw_unique_ids}

        self._shared_state.cum_state.clear()
        self._shared_state.cum_state.update(restored_cum_state)
        self._shared_state.unique_scenario_ids.clear()
        self._shared_state.unique_scenario_ids.update(restored_unique_ids)
        self._shared_state.pending_scenario_ids.clear()

    def _validate_and_add(self, key: str, value: float) -> None:
        """Add *value* to *key* after checking it against the step buffer.

        A key may not be cumulative, and may not be both a leaf and an
        ancestor of another buffered key.
        """
        if is_cumulative_metric_key(key):
            raise ValueError(
                f"Metric key '{key}' uses the reserved cumulative namespace."
            )

        for existing_key in self._shared_state.step_buffer:
            if existing_key == key:
                continue
            if existing_key.startswith(f"{key}/"):
                raise ValueError(
                    f"Cannot log '{key}' as a leaf: it is an ancestor of '{existing_key}'."
                )
            if key.startswith(f"{existing_key}/"):
                raise ValueError(
                    f"Cannot log '{key}' as a leaf: '{existing_key}' is already a leaf ancestor."
                )

        self._shared_state.step_buffer[key] = (
            self._shared_state.step_buffer.get(key, 0.0) + value
        )

    def _compute_rollups(self, cost_metrics: dict[str, float]) -> dict[str, float]:
        """Return parent sums for *cost_metrics* plus the ``costs/all`` total.

        Every intermediate path (``costs/a`` for ``costs/a/b``) gets the sum
        of its descendants. ``costs/all`` sums each top-level child once.
        """
        if not cost_metrics:
            return {}

        all_parents: set[str] = set()
        for key in cost_metrics:
            parts = key.split("/")
            # Depth 2 upward: "costs" alone is covered by "costs/all".
            for depth in range(2, len(parts)):
                all_parents.add("/".join(parts[:depth]))

        rollups: dict[str, float] = {}
        for parent in all_parents:
            prefix = f"{parent}/"
            rollups[parent] = sum(
                value for key, value in cost_metrics.items() if key.startswith(prefix)
            )

        top_level_children = {key.split("/")[1] for key in cost_metrics}
        costs_all = 0.0
        for child_name in top_level_children:
            child_key = f"costs/{child_name}"
            # A top-level child is either a rolled-up parent or a leaf itself.
            if child_key in rollups:
                costs_all += rollups[child_key]
            else:
                costs_all += cost_metrics[child_key]
        rollups["costs/all"] = costs_all

        return rollups

    def _update_throughput_metrics(self, result: dict[str, float]) -> None:
        """Add cumulative idle times and average tokens/second to *result*.

        Averages are cumulative tokens divided by cumulative seconds and are
        only emitted when the step touched the relevant token or time metric
        and the cumulative seconds are positive.
        """
        for step_key, cum_key in _THROUGHPUT_IDLE_MAPPINGS.items():
            if step_key not in result:
                continue
            next_value = (
                self._shared_state.cum_state.get(cum_key, 0.0) + result[step_key]
            )
            self._shared_state.cum_state[cum_key] = next_value
            result[cum_key] = next_value

        if "data/step_trainer_tokens" in result or "time/step_trainer_s" in result:
            trainer_tokens = self._shared_state.cum_state.get("data/cum/trainer_tokens")
            trainer_seconds = self._shared_state.cum_state.get("time/cum/trainer_s")
            if (
                trainer_tokens is not None
                and trainer_seconds is not None
                and trainer_seconds > 0
            ):
                result["throughput/avg_trainer_tok_per_s"] = (
                    trainer_tokens / trainer_seconds
                )

        if "data/step_actor_tokens" in result or "time/step_actor_s" in result:
            actor_tokens = self._shared_state.cum_state.get("data/cum/actor_tokens")
            actor_seconds = self._shared_state.cum_state.get("time/cum/actor_s")
            if (
                actor_tokens is not None
                and actor_seconds is not None
                and actor_seconds > 0
            ):
                result["throughput/avg_actor_tok_per_s"] = actor_tokens / actor_seconds
TRAIN_GRADIENT_STEPS_KEY = "data/step_num_gradient_steps"
# Keys that must carry the same value in every sample instead of being averaged.
_INVARIANT_METRIC_KEYS = frozenset({TRAIN_GRADIENT_STEPS_KEY})


def average_metric_samples(
    metric_samples: Iterable[dict[str, float]],
) -> dict[str, float]:
    """Average each metric over the samples that contain it.

    Values are coerced to ``float``. Invariant keys are returned as their
    single shared value rather than a mean. Keys appear in the result in the
    order they were first seen.

    Raises:
        ValueError: If an invariant key takes two different values.
    """
    running: dict[str, tuple[float, int]] = {}  # name -> (total, count)
    pinned: dict[str, float] = {}  # first value seen for each invariant key

    for sample in metric_samples:
        for name, raw in sample.items():
            number = float(raw)
            if name in _INVARIANT_METRIC_KEYS:
                if name not in pinned:
                    pinned[name] = number
                elif pinned[name] != number:
                    raise ValueError(
                        f"Metric '{name}' must be invariant across samples, "
                        f"got {pinned[name]} and {number}."
                    )

            total, count = running.get(name, (0.0, 0))
            running[name] = (total + number, count + 1)

    averaged: dict[str, float] = {}
    for name, (total, count) in running.items():
        if name in _INVARIANT_METRIC_KEYS:
            averaged[name] = pinned[name]
        else:
            averaged[name] = total / count
    return averaged


@dataclass(frozen=True)
class TrajectoryBatchSummary:
    """Counts describing one batch of trajectory groups."""

    num_scenarios: int  # set to the number of groups by summarize_trajectory_groups
    num_trajectories: int  # trajectories plus exceptions across all groups
    num_groups_submitted: int
    num_groups_trainable: int  # groups holding at least two distinct rewards
    scenario_ids: list[str]  # distinct scenario ids in first-seen order


def summarize_trajectory_groups(
    trajectory_groups: Iterable[TrajectoryGroup],
) -> TrajectoryBatchSummary:
    """Summarize *trajectory_groups* into a ``TrajectoryBatchSummary``.

    The iterable is consumed once. Groups without a scenario id contribute
    to the counts but not to ``scenario_ids``.
    """
    batch = list(trajectory_groups)

    # A dict doubles as an insertion-ordered set of scenario ids.
    ordered_ids: dict[str, None] = {}
    for group in batch:
        found = _extract_scenario_id(group)
        if found is not None:
            ordered_ids.setdefault(found, None)

    group_total = len(batch)
    trajectory_total = sum(
        len(group.trajectories) + len(group.exceptions) for group in batch
    )
    trainable_total = sum(1 for group in batch if _group_is_trainable(group))

    return TrajectoryBatchSummary(
        num_scenarios=group_total,
        num_trajectories=trajectory_total,
        num_groups_submitted=group_total,
        num_groups_trainable=trainable_total,
        scenario_ids=list(ordered_ids),
    )


def build_data_metrics_from_summary(
    summary: TrajectoryBatchSummary,
    *,
    include_trainable_groups: bool,
) -> dict[str, float]:
    """Return the ``data/step_*`` metrics for *summary*.

    ``data/step_num_groups_trainable`` is included only when
    *include_trainable_groups* is true.
    """
    data_metrics: dict[str, float] = {}
    data_metrics["data/step_num_scenarios"] = float(summary.num_scenarios)
    data_metrics["data/step_num_trajectories"] = float(summary.num_trajectories)
    data_metrics["data/step_num_groups_submitted"] = float(
        summary.num_groups_submitted
    )
    if include_trainable_groups:
        data_metrics["data/step_num_groups_trainable"] = float(
            summary.num_groups_trainable
        )
    return data_metrics


def build_train_metrics_from_summary(
    summary: TrajectoryBatchSummary,
) -> dict[str, float]:
    """Return the ``train/num_*`` metrics for *summary*."""
    train_fields = (
        ("train/num_groups_submitted", summary.num_groups_submitted),
        ("train/num_groups_trainable", summary.num_groups_trainable),
        ("train/num_trajectories", summary.num_trajectories),
    )
    return {metric_name: float(count) for metric_name, count in train_fields}


def build_training_summary_metrics(
    summary: TrajectoryBatchSummary,
    *,
    include_trainable_groups: bool,
) -> dict[str, float]:
    """Return the data metrics followed by the train metrics for *summary*."""
    combined = build_data_metrics_from_summary(
        summary,
        include_trainable_groups=include_trainable_groups,
    )
    combined.update(build_train_metrics_from_summary(summary))
    return combined


def _group_is_trainable(group: TrajectoryGroup) -> bool:
    """Return True when the group's trajectories carry at least two distinct rewards."""
    rewards = [trajectory.reward for trajectory in group.trajectories]
    if len(rewards) <= 1:
        return False
    return len(set(rewards)) > 1


def _extract_scenario_id(group: TrajectoryGroup) -> str | None:
    """Return the first scenario id found on the group, then on its trajectories."""
    candidates = [group.metadata]
    candidates.extend(trajectory.metadata for trajectory in group.trajectories)
    return next(
        (
            found
            for found in map(_extract_scenario_id_from_metadata, candidates)
            if found is not None
        ),
        None,
    )


def _extract_scenario_id_from_metadata(
    metadata: dict[str, Any],
) -> str | None:
    """Return ``metadata["scenario_id"]`` as a string, or ``None`` if absent or None."""
    raw_id = metadata.get("scenario_id")
    return None if raw_id is None else str(raw_id)
import dev from .costs import CostCalculator +from .metrics import MetricsBuilder, is_builder_managed_metric +from .metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, + average_metric_samples, + build_data_metrics_from_summary, + build_train_metrics_from_summary, + summarize_trajectory_groups, +) from .trajectories import Trajectory, TrajectoryGroup from .types import TrainConfig, TrainSFTConfig from .utils.trajectory_logging import write_trajectory_groups_parquet @@ -26,9 +36,20 @@ ModelConfig = TypeVar("ModelConfig", bound=BaseModel | None) StateType = TypeVar("StateType", bound=dict[str, Any], default=dict[str, Any]) -COSTS_STATE_KEY = "_costs" -COSTS_METRIC_PREFIX = "costs_" -COSTS_TOTAL_KEY = f"{COSTS_METRIC_PREFIX}total" +METRICS_BUILDER_STATE_KEY = "_metrics_builder_state" +METRIC_SECTIONS = frozenset( + { + "reward", + "loss", + "offpolicy", + "pipeline", + "throughput", + "costs", + "time", + "data", + } +) +METRIC_SPLITS = frozenset({"train", "val", "test"}) class Model( @@ -93,7 +114,13 @@ class Model( _s3_prefix: str | None = None _openai_client: AsyncOpenAI | None = None _wandb_run: Optional["Run"] = None # Private, for lazy wandb initialization - _costs_lock: asyncio.Lock + _wandb_defined_metrics: set[str] + _run_start_time: float + _run_start_monotonic: float + _last_local_train_log_monotonic: float + _last_local_train_step: int | None + _metrics_builder: MetricsBuilder + _metrics_builder_state_loaded: bool _cost_calculator: CostCalculator def __init__( @@ -123,6 +150,17 @@ def __init__( report_metrics=report_metrics, **kwargs, ) + object.__setattr__(self, "_wandb_defined_metrics", set()) + object.__setattr__(self, "_run_start_time", time.time()) + object.__setattr__(self, "_run_start_monotonic", time.monotonic()) + object.__setattr__( + self, "_last_local_train_log_monotonic", self._run_start_monotonic + ) + object.__setattr__(self, "_last_local_train_step", None) + object.__setattr__( + self, "_metrics_builder", 
MetricsBuilder(cost_context="train") + ) + object.__setattr__(self, "_metrics_builder_state_loaded", False) @overload def __new__( @@ -376,13 +414,28 @@ def _get_wandb_run(self) -> Optional["Run"]: ), ) self._wandb_run = run + object.__setattr__( + self, + "_wandb_defined_metrics", + { + "training_step", + "time/wall_clock_sec", + }, + ) # Define training_step as the x-axis for all metrics. # This allows out-of-order logging (e.g., async validation for previous steps). wandb.define_metric("training_step") + wandb.define_metric("time/wall_clock_sec") + wandb.define_metric("reward/*", step_metric="training_step") + wandb.define_metric("loss/*", step_metric="training_step") + wandb.define_metric("throughput/*", step_metric="training_step") + wandb.define_metric("costs/*", step_metric="training_step") + wandb.define_metric("time/*", step_metric="training_step") + wandb.define_metric("data/*", step_metric="training_step") wandb.define_metric("train/*", step_metric="training_step") wandb.define_metric("val/*", step_metric="training_step") - wandb.define_metric("costs/*", step_metric="training_step") + wandb.define_metric("test/*", step_metric="training_step") return self._wandb_run def _log_metrics( @@ -392,7 +445,24 @@ def _log_metrics( step: int, ) -> None: """Log metrics to history.jsonl and optionally wandb.""" - prefixed = {f"{split}/{k}": v for k, v in metrics.items()} + if split in METRIC_SPLITS: + prefixed = {} + for key, value in metrics.items(): + first_component = key.split("/", 1)[0] + has_prefix_component = "/" in key + if has_prefix_component and ( + first_component in METRIC_SECTIONS + or first_component in METRIC_SPLITS + ): + prefixed[key] = value + else: + prefixed[f"{split}/{key}"] = value + else: + prefixed = {f"{split}/{k}": v for k, v in metrics.items()} + + prefixed["training_step"] = step + prefixed["time/wall_clock_sec"] = time.time() - self._run_start_time + output_dir = self._get_output_dir() # Ensure output directory exists @@ -416,65 +486,158 
@@ def _log_metrics( ) or (self.report_metrics is not None and "wandb" in self.report_metrics) if should_log_wandb: if run := self._get_wandb_run(): - run.log({"training_step": step, **prefixed}) + self._define_wandb_step_metrics(prefixed.keys()) + # Let W&B use its own monotonically increasing history step. + # ART's `training_step` remains the x-axis via define_metric, + # which preserves out-of-order eval logging. + run.log(prefixed) - async def _record_costs( + def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None: + import wandb + + for key in keys: + if not key.startswith("costs/"): + continue + if key in self._wandb_defined_metrics: + continue + wandb.define_metric(key, step_metric="training_step") + self._wandb_defined_metrics.add(key) + + def _route_metrics_and_collect_non_costs( + self, metrics: dict[str, float], split: str + ) -> dict[str, float]: + non_cost_metrics: dict[str, float] = {} + for metric, value in metrics.items(): + numeric_value = float(value) + if metric.startswith("costs/"): + self._metrics_builder.add_cost(metric[len("costs/") :], numeric_value) + continue + if metric.startswith("costs_"): + raise ValueError( + "Legacy cost keys like 'costs_prefill' are no longer supported. " + "Log hierarchical costs like 'costs/train/prefill' or " + "'costs/eval/prefill' instead." 
+ ) + if is_builder_managed_metric(metric): + self._metrics_builder.add_metric(metric, numeric_value) + continue + non_cost_metrics[metric] = numeric_value + return non_cost_metrics + + def _collect_automatic_backend_metrics( self, + *, split: str, step: int, + provided_metric_keys: set[str], + ) -> dict[str, float]: + if split != "train" or self._backend is None: + return {} + + supports_step_metrics = getattr( + self._backend, "supports_automatic_train_step_metrics", None + ) + if not callable(supports_step_metrics) or not supports_step_metrics(): + return {} + + if self._last_local_train_step == step: + return {} + + now = time.monotonic() + step_wall_s = max(0.0, now - self._last_local_train_log_monotonic) + object.__setattr__(self, "_last_local_train_log_monotonic", now) + object.__setattr__(self, "_last_local_train_step", step) + + automatic_metrics: dict[str, float] = {} + if "time/step_wall_s" not in provided_metric_keys: + automatic_metrics["time/step_wall_s"] = step_wall_s + + gpu_cost_getter = getattr( + self._backend, "automatic_gpu_cost_per_hour_usd", None + ) + if callable(gpu_cost_getter) and "costs/gpu" not in provided_metric_keys: + gpu_cost_per_hour_usd = gpu_cost_getter(self) + if gpu_cost_per_hour_usd is not None: + automatic_metrics["costs/gpu"] = ( + step_wall_s * float(gpu_cost_per_hour_usd) / 3600.0 + ) + + return automatic_metrics + + def _add_default_step_metrics( + self, + trajectory_groups: list[TrajectoryGroup], *, - cost_components: dict[str, float], - cost_total_direct: float, - cost_seen: bool, - ) -> None: - component_total = sum(cost_components.values()) - step_total = component_total if component_total > 0 else cost_total_direct - if not cost_seen or step_total <= 0: + split: str, + provided_metric_keys: set[str], + ) -> dict[str, float]: + if split not in METRIC_SPLITS: + return {} + + summary = summarize_trajectory_groups(trajectory_groups) + default_data_metrics = build_data_metrics_from_summary( + summary, + 
include_trainable_groups=split == "train", + ) + for key, value in default_data_metrics.items(): + if key in provided_metric_keys: + continue + self._metrics_builder.add_metric(key, value) + + if summary.scenario_ids: + self._metrics_builder.add_data(scenario_ids=summary.scenario_ids) + + if split != "train": + return {} + + default_train_metrics = build_train_metrics_from_summary(summary) + return { + key: value + for key, value in default_train_metrics.items() + if key not in provided_metric_keys + } + + def metrics_builder(self, cost_context: str | None = None) -> MetricsBuilder: + self._load_metrics_builder_state() + if cost_context is None: + return self._metrics_builder + return self._metrics_builder.for_cost_context(cost_context) + + def activate_metrics_context(self, cost_context: str) -> Token[MetricsBuilder]: + return self.metrics_builder(cost_context).activate() + + def _load_metrics_builder_state(self) -> None: + if self._metrics_builder_state_loaded: return + state = self.read_state() or {} + metrics_state = state.get(METRICS_BUILDER_STATE_KEY) + if isinstance(metrics_state, dict): + self._metrics_builder.load_state_dict(metrics_state) + object.__setattr__(self, "_metrics_builder_state_loaded", True) + + def _persist_metrics_builder_state(self) -> None: + self.merge_state( + {METRICS_BUILDER_STATE_KEY: self._metrics_builder.state_dict()} + ) - async with self._costs_lock: - existing_state = self.read_state() or {} - raw_costs = existing_state.get(COSTS_STATE_KEY) or {} - cumulative = { - key: float(value) - for key, value in raw_costs.items() - if isinstance(value, (int, float)) - } - last_steps = raw_costs.get("_last_steps") - if not isinstance(last_steps, dict): - last_steps = {} - last_step = last_steps.get(split) - - if isinstance(last_step, (int, float)) and int(last_step) >= step: - for component, value in cost_components.items(): - if value == 0: - continue - cumulative_key = f"{split}_{component}" - cumulative[cumulative_key] = max( - 
cumulative.get(cumulative_key, 0.0), value - ) - cumulative[split] = max(cumulative.get(split, 0.0), step_total) - cumulative["total"] = max( - cumulative.get("total", 0.0), cumulative.get(split, 0.0) - ) - self.merge_state( - {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}} - ) - self._log_metrics(cumulative, "costs", step) - return - - for component, value in cost_components.items(): - if value == 0: - continue - cumulative_key = f"{split}_{component}" - cumulative[cumulative_key] = cumulative.get(cumulative_key, 0.0) + value - cumulative[split] = cumulative.get(split, 0.0) + step_total - cumulative["total"] = cumulative.get("total", 0.0) + step_total - last_steps[split] = step - self.merge_state( - {COSTS_STATE_KEY: {**cumulative, "_last_steps": last_steps}} - ) - self._log_metrics(cumulative, "costs", step) + def _normalize_trajectory_groups( + self, + trajectories: Iterable[Trajectory | BaseException] | Iterable[TrajectoryGroup], + ) -> list[TrajectoryGroup]: + items = list(trajectories) + if not items: + return [] + + if all(isinstance(item, TrajectoryGroup) for item in items): + return cast(list[TrajectoryGroup], items) + + if all(isinstance(item, (Trajectory, BaseException)) for item in items): + return [TrajectoryGroup(cast(Iterable[Trajectory | BaseException], items))] + + raise TypeError( + "trajectories must be an iterable of TrajectoryGroup objects or " + "an iterable of Trajectory/BaseException items" + ) async def log( self, @@ -506,58 +669,45 @@ async def log( if step is None: step = await self.get_step() if self.trainable else 0 + self._load_metrics_builder_state() + # If only metrics provided (no trajectories), just log them and return if trajectories is None: if metrics is not None: - cost_step = await self.get_step() - cost_components: dict[str, float] = {} - cost_total_direct = 0.0 - cost_seen = False - - for metric, value in metrics.items(): - if not isinstance(value, (int, float)): - continue - if metric == COSTS_TOTAL_KEY: - 
raise ValueError( - "Do not log 'costs_total' directly. Log costs_* components " - "(e.g., costs_prefill, costs_sample) and totals are derived." - ) - elif metric.startswith(COSTS_METRIC_PREFIX): - component = metric[len(COSTS_METRIC_PREFIX) :] - if component: - cost_components[component] = cost_components.get( - component, 0.0 - ) + float(value) - cost_seen = True - - metrics_without_costs = { - key: value - for key, value in metrics.items() - if not key.startswith(COSTS_METRIC_PREFIX) - } - if metrics_without_costs: - self._log_metrics(metrics_without_costs, split, step) - - await self._record_costs( - split, - cost_step, - cost_components=cost_components, - cost_total_direct=cost_total_direct, - cost_seen=cost_seen, + provided_metric_keys = set(metrics) + automatic_metrics = self._collect_automatic_backend_metrics( + split=split, + step=step, + provided_metric_keys=provided_metric_keys, + ) + if automatic_metrics: + self._route_metrics_and_collect_non_costs(automatic_metrics, split) + metrics_without_costs = self._route_metrics_and_collect_non_costs( + metrics, split ) + builder_metrics = await self._metrics_builder.flush() + merged_metrics = {**metrics_without_costs, **builder_metrics} + if merged_metrics: + self._log_metrics(merged_metrics, split, step) + self._persist_metrics_builder_state() return - # Convert to list[TrajectoryGroup] - if any(isinstance(t, Trajectory) for t in trajectories) or any( - isinstance(t, BaseException) for t in trajectories - ): - trajectory_groups = [ - TrajectoryGroup( - cast(Iterable[Trajectory | BaseException], trajectories) - ) - ] - else: - trajectory_groups = cast(list[TrajectoryGroup], list(trajectories)) + trajectory_groups = self._normalize_trajectory_groups(trajectories) + provided_metric_keys = set(metrics or {}) + + automatic_metrics = self._collect_automatic_backend_metrics( + split=split, + step=step, + provided_metric_keys=provided_metric_keys, + ) + if automatic_metrics: + 
self._route_metrics_and_collect_non_costs(automatic_metrics, split) + + default_train_metrics = self._add_default_step_metrics( + trajectory_groups, + split=split, + provided_metric_keys=provided_metric_keys, + ) # Ensure output directories exist output_dir = self._get_output_dir() @@ -571,59 +721,53 @@ async def log( ) # 2. Calculate aggregate metrics (excluding additive costs) - cost_step = await self.get_step() - all_metrics: dict[str, list[float]] = {"reward": [], "exception_rate": []} + reward_key = "reward/mean" if split == "train" else "reward" + exception_rate_key = ( + "reward/exception_rate" if split == "train" else "exception_rate" + ) + reward_std_dev_key = "reward/std_dev" if split == "train" else "reward_std_dev" + + all_metrics: dict[str, list[float]] = { + reward_key: [], + exception_rate_key: [], + } group_metrics: dict[str, list[float]] = {} - cost_components: dict[str, float] = {} - cost_total_direct = 0.0 - cost_seen = False - - def _add_costs(metrics_dict: dict[str, float | int | bool]) -> None: - nonlocal cost_total_direct, cost_seen - for metric, value in metrics_dict.items(): - if not isinstance(value, (int, float)): - continue - if metric == COSTS_TOTAL_KEY: - raise ValueError( - "Do not log 'costs_total' directly. Log costs_* components " - "(e.g., costs_prefill, costs_sample) and totals are derived." 
- ) - elif metric.startswith(COSTS_METRIC_PREFIX): - component = metric[len(COSTS_METRIC_PREFIX) :] - if component: - cost_components[component] = cost_components.get( - component, 0.0 - ) + float(value) - cost_seen = True for group in trajectory_groups: if group.metrics: - _add_costs(group.metrics) + group_non_cost = self._route_metrics_and_collect_non_costs( + cast(dict[str, float], group.metrics), split + ) + else: + group_non_cost = {} if group.trajectories: - for metric, value in group.metrics.items(): - if metric.startswith(COSTS_METRIC_PREFIX): - continue + for metric, value in group_non_cost.items(): if metric not in group_metrics: group_metrics[metric] = [] group_metrics[metric].append(float(value)) - for trajectory in group: - if isinstance(trajectory, BaseException): - all_metrics["exception_rate"].append(1) - continue - else: - all_metrics["exception_rate"].append(0) - # Add reward metric - all_metrics["reward"].append(trajectory.reward) + + all_metrics[exception_rate_key].extend(0.0 for _ in group.trajectories) + all_metrics[exception_rate_key].extend(1.0 for _ in group.exceptions) + + for trajectory in group.trajectories: + all_metrics[reward_key].append(trajectory.reward) # Collect other custom metrics + trajectory_metrics: dict[str, float] = {} for metric, value in trajectory.metrics.items(): - if metric.startswith(COSTS_METRIC_PREFIX): - continue + routed_metric = metric + if split == "train" and "/" not in routed_metric: + routed_metric = f"reward/{routed_metric}" + trajectory_metrics[routed_metric] = float(value) + + non_cost_trajectory_metrics = self._route_metrics_and_collect_non_costs( + trajectory_metrics, + split, + ) + for metric, value in non_cost_trajectory_metrics.items(): if metric not in all_metrics: all_metrics[metric] = [] all_metrics[metric].append(float(value)) - if trajectory.metrics: - _add_costs(trajectory.metrics) # Calculate averages for all metrics averages: dict[str, float] = {} @@ -631,39 +775,38 @@ def 
_add_costs(metrics_dict: dict[str, float | int | bool]) -> None: if len(values) > 0: averages[metric] = sum(values) / len(values) + averages.update(default_train_metrics) + # Aggregate group-level metrics once per group for metric, values in group_metrics.items(): if len(values) > 0: - averages[f"group_metric_{metric}"] = sum(values) / len(values) + group_key = ( + f"reward/group_{metric}" + if split == "train" + else f"group_metric_{metric}" + ) + averages[group_key] = sum(values) / len(values) # Calculate average standard deviation of rewards within groups from .utils.old_benchmarking.calculate_step_metrics import ( calculate_step_std_dev, ) - averages["reward_std_dev"] = calculate_step_std_dev(trajectory_groups) + averages[reward_std_dev_key] = calculate_step_std_dev(trajectory_groups) # Merge in any additional metrics passed directly if metrics is not None: - _add_costs(metrics) - metrics_without_costs = { - key: value - for key, value in metrics.items() - if not key.startswith(COSTS_METRIC_PREFIX) - } + metrics_without_costs = self._route_metrics_and_collect_non_costs( + metrics, split + ) averages.update(metrics_without_costs) - # 3. Log metrics (writes to history.jsonl and wandb) - self._log_metrics(averages, split, step) - - # 4. Log cumulative costs (additive) - await self._record_costs( - split, - cost_step, - cost_components=cost_components, - cost_total_direct=cost_total_direct, - cost_seen=cost_seen, - ) + # 3. Merge in any builder-managed metrics and log a single row. 
+ builder_metrics = await self._metrics_builder.flush() + merged_metrics = {**averages, **builder_metrics} + if merged_metrics: + self._log_metrics(merged_metrics, split, step) + self._persist_metrics_builder_state() async def get_step(self) -> int: """ @@ -714,7 +857,6 @@ def __init__( report_metrics=report_metrics, **kwargs, ) - object.__setattr__(self, "_costs_lock", asyncio.Lock()) object.__setattr__(self, "_cost_calculator", self._noop_cost_calculator) if _internal_config is not None: # Bypass BaseModel __setattr__ to allow setting private attr @@ -733,7 +875,9 @@ def set_cost_calculator(self, calculator: CostCalculator | None) -> None: @staticmethod def _noop_cost_calculator( - _prompt_tokens: int | None, _completion_tokens: int | None + _prompt_tokens: int | None, + _completion_tokens: int | None, + _cost_context: str, ) -> dict[str, float]: return {} @@ -881,6 +1025,7 @@ async def train( # 1. Train (backend no longer logs internally) training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self.backend()._train_model( self, groups_list, @@ -889,16 +1034,11 @@ async def train( verbose, ): training_metrics.append(metrics) + trainer_elapsed = time.monotonic() - trainer_started # 2. Calculate aggregated training metrics - avg_metrics: dict[str, float] = {} - if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - if k != "num_gradient_steps" - } + avg_metrics = average_metric_samples(training_metrics) + avg_metrics.setdefault("time/step_trainer_s", trainer_elapsed) # 3. 
Log trajectories and training metrics together (single wandb log call) step = await self.get_step() @@ -929,6 +1069,7 @@ async def train_sft( # Collect all metrics and aggregate them at the end (same as RL) _config = _config or {} # ty:ignore[invalid-assignment] training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self.backend()._train_sft( self, trajectories, @@ -937,14 +1078,14 @@ async def train_sft( verbose, ): training_metrics.append(metrics) + trainer_elapsed = time.monotonic() - trainer_started # Log aggregated training metrics once (same as RL) if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - } + avg_metrics = average_metric_samples(training_metrics) + avg_metrics["time/step_trainer_s"] = trainer_elapsed # Get the current step after training step = await self.get_step() - self._log_metrics(avg_metrics, "train", step) + await self.log( + trajectories=None, split="train", metrics=avg_metrics, step=step + ) diff --git a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py index 52c829750..f9593c240 100644 --- a/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py +++ b/src/art/pipeline_trainer/binary_prefix_tool_pipeline.py @@ -159,12 +159,12 @@ def print_history_summary(model: art.TrainableModel, tail: int = 5) -> None: rows = pl.read_ndjson(str(history_path)).to_dicts() - train_rows = [row for row in rows if "train/reward" in row] + train_rows = [row for row in rows if "reward/mean" in row] print("\nRecent training metrics:") for row in train_rows[-tail:]: step = row["step"] - reward = row["train/reward"] - std_dev = row["train/reward_std_dev"] + reward = row["reward/mean"] + std_dev = row["reward/std_dev"] discarded = row["train/discarded_stale_samples"] off_policy = row["train/steps_off_policy"] print( @@ 
-229,7 +229,9 @@ async def main() -> None: openai_client = model.openai_client() cost_calculator = model.cost_calculator - async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory: + async def do_rollout( + scenario: Scenario, temp: float, cost_context: str + ) -> art.Trajectory: """Core rollout logic used by both training and eval.""" messages: art.Messages = scenario["messages"] response = await openai_client.chat.completions.create( @@ -265,6 +267,7 @@ async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory: sample_costs = cost_calculator( prompt_tokens, completion_tokens, + cost_context, ) if sample_costs: metrics.update(sample_costs) @@ -281,7 +284,7 @@ async def single_rollout( scenario: Scenario, _config: PipelineConfig, ) -> art.Trajectory: - return await do_rollout(scenario, temperature) + return await do_rollout(scenario, temperature, "train") rollout_fn = make_group_rollout_fn(single_rollout, n=rollouts_per_scenario) @@ -290,7 +293,7 @@ async def single_rollout( async def eval_fn( _model: art.TrainableModel, _step: int, _config: PipelineConfig ) -> list[art.Trajectory]: - tasks = [do_rollout(build_scenario(), eval_temperature)] + tasks = [do_rollout(build_scenario(), eval_temperature, "eval")] results = await asyncio.gather(*tasks, return_exceptions=True) trajectories = [r for r in results if isinstance(r, art.Trajectory)] if trajectories: @@ -312,7 +315,7 @@ def build_scenario() -> Scenario: async def scenario_iter(): for i in range(scenario_count): scenario = build_scenario() - scenario["metadata"] = {"scenario_idx": i} + scenario["metadata"] = {"scenario_id": str(i)} yield scenario config = PipelineConfig( diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py index a061636b5..5d569277a 100644 --- a/src/art/pipeline_trainer/trainer.py +++ b/src/art/pipeline_trainer/trainer.py @@ -16,6 +16,8 @@ from .types import ConfigT, EvalFn, RolloutFn, ScenarioT, SingleRolloutFn # noqa: F401 
PIPELINE_STATE_KEY = "_pipeline_trainer" +_ROLLOUT_WALL_TIME_KEY = "_art_rollout_wall_s" +_ACTOR_IDLE_TIME_KEY = "_art_actor_idle_s" def _to_async_iterator(iterable: Iterable[T] | AsyncIterator[T]) -> AsyncIterator[T]: @@ -322,13 +324,21 @@ async def _rollout_worker(self, worker_id: int) -> None: self._status.note_rollout_started() errored = False try: + wait_started = time.monotonic() await self._wait_for_policy() + actor_idle_s = time.monotonic() - wait_started if self.state.done: break initial_version = self.state.policy_version - group = await self.rollout_fn(self.model, scenario, self.config) + token = self.model.activate_metrics_context("train") + rollout_started = time.monotonic() + try: + group = await self.rollout_fn(self.model, scenario, self.config) + finally: + token.var.reset(token) + rollout_wall_s = time.monotonic() - rollout_started if not isinstance(group, TrajectoryGroup): errored = True continue @@ -340,7 +350,9 @@ async def _rollout_worker(self, worker_id: int) -> None: ) if self.state.done: break - await self._put_output_group(group) + queue_wait_s = await self._put_output_group(group) + group.metadata[_ROLLOUT_WALL_TIME_KEY] = rollout_wall_s + group.metadata[_ACTOR_IDLE_TIME_KEY] = actor_idle_s + queue_wait_s except asyncio.CancelledError: raise except Exception as exc: @@ -379,13 +391,17 @@ async def _training_stage(self) -> None: if stop_at_step is not None and current_step >= stop_at_step: break step_start = time.monotonic() + collect_started = time.monotonic() batch, discarded, saw_sentinel = await self._collect_batch(current_step) + trainer_idle_s = time.monotonic() - collect_started self.state.discarded_stale_samples += discarded if discarded: self._status.note_stale(discarded) if not batch: break + actor_wall_s, actor_idle_s = self._consume_batch_rollout_timings(batch) + expected_step = current_step + 1 should_eval_step = self._should_eval_step(expected_step) should_checkpoint = self.save_checkpoint and should_eval_step @@ -395,10 
+411,9 @@ async def _training_stage(self) -> None: self.state.policy_updated.notify_all() self._status.note_training_start(len(batch)) - train_call_start: float | None = None + train_call_start = time.monotonic() if os.getenv("ART_TRAIN_STEP_LOG"): print(f"[train] step {expected_step} starting (batch={len(batch)})") - train_call_start = time.perf_counter() try: result = await self.backend.train( self.model, @@ -414,8 +429,8 @@ async def _training_stage(self) -> None: self._status.note_training_end() raise finally: - if train_call_start is not None: - train_call_elapsed = time.perf_counter() - train_call_start + train_call_elapsed = time.monotonic() - train_call_start + if os.getenv("ART_TRAIN_STEP_LOG"): print( f"[train] step {expected_step} done in " f"{train_call_elapsed:.1f}s" @@ -438,7 +453,14 @@ async def _training_stage(self) -> None: ), "steps_off_policy": steps_off_policy, "num_groups": float(len(batch)), + "time/step_wall_s": step_seconds, + "throughput/step_trainer_idle_s": trainer_idle_s, } + metrics.setdefault("time/step_trainer_s", train_call_elapsed) + if actor_wall_s > 0: + metrics["time/step_actor_s"] = actor_wall_s + if actor_idle_s > 0: + metrics["throughput/step_actor_idle_s"] = actor_idle_s metrics.update(result.metrics) await self.model.log( @@ -561,14 +583,22 @@ async def _run_eval(self, step: int) -> None: assert self.eval_fn is not None self._status.note_val_started(step) reward: float | None = None + eval_elapsed = 0.0 try: - result = await self.eval_fn(self.model, step, self.config) + token = self.model.activate_metrics_context("eval") + eval_started = time.monotonic() + try: + result = await self.eval_fn(self.model, step, self.config) + finally: + token.var.reset(token) + eval_elapsed = time.monotonic() - eval_started splits: dict[str, list[art.Trajectory | art.TrajectoryGroup]] if isinstance(result, dict): splits = result else: splits = {"val": result} + logged_eval_timing = False for split_name, items in splits.items(): groups, 
trajectories = self._normalize_eval_items(items) if split_name == "val": @@ -577,7 +607,25 @@ async def _run_eval(self, step: int) -> None: else: reward = None if groups: - await self.model.log(groups, split=split_name, step=step) + metrics = ( + {"time/step_eval_s": eval_elapsed} + if not logged_eval_timing + else None + ) + await self.model.log( + groups, + split=split_name, + step=step, + metrics=metrics, + ) + logged_eval_timing = True + if not logged_eval_timing and eval_elapsed > 0: + await self.model.log( + trajectories=None, + split="val", + step=step, + metrics={"time/step_eval_s": eval_elapsed}, + ) except asyncio.CancelledError: raise except Exception as exc: @@ -630,6 +678,9 @@ def _apply_scenario_metadata( continue if not self._is_scalar_metadata(value): continue + if key == "scenario_id": + group.metadata["scenario_id"] = value + continue group.metadata[f"scenario_{key}"] = value def _is_group_stale(self, group: TrajectoryGroup, min_version: int) -> bool: @@ -734,12 +785,31 @@ def _persist_state(self, training_step: int) -> None: def _is_scalar_metadata(value: object) -> bool: return value is None or isinstance(value, (str, int, float, bool)) - async def _put_output_group(self, group: TrajectoryGroup) -> None: + async def _put_output_group(self, group: TrajectoryGroup) -> float: assert self._output_queue is not None + queue_wait_started = time.monotonic() while not self.state.done: try: await asyncio.wait_for(self._output_queue.put(group), timeout=1.0) self._status.note_group_enqueued(group) - return + return time.monotonic() - queue_wait_started except asyncio.TimeoutError: continue + return time.monotonic() - queue_wait_started + + def _consume_batch_rollout_timings( + self, batch: list[TrajectoryGroup] + ) -> tuple[float, float]: + rollout_wall_s = 0.0 + actor_idle_s = 0.0 + for group in batch: + rollout_wall_s += self._pop_float_metadata(group, _ROLLOUT_WALL_TIME_KEY) + actor_idle_s += self._pop_float_metadata(group, _ACTOR_IDLE_TIME_KEY) + return 
rollout_wall_s, actor_idle_s + + @staticmethod + def _pop_float_metadata(group: TrajectoryGroup, key: str) -> float: + value = group.metadata.pop(key, 0.0) + if isinstance(value, (int, float)): + return float(value) + return 0.0 diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index dea0198e7..fcb9f68fb 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -1,4 +1,5 @@ import asyncio +import time from typing import TYPE_CHECKING, Any, AsyncIterator, Iterable, Literal import warnings @@ -9,6 +10,12 @@ from .. import dev from ..backend import AnyTrainableModel, Backend +from ..metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, + average_metric_samples, + build_training_summary_metrics, + summarize_trajectory_groups, +) from ..trajectories import Trajectory, TrajectoryGroup from ..types import ServerlessTrainResult, TrainConfig, TrainSFTConfig from ..utils.record_provenance import record_provenance @@ -30,6 +37,44 @@ def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None: return None +_UPSTREAM_TRAIN_METRIC_KEYS = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def _canonicalize_upstream_metric_key(metric: str) -> str: + if "/" in metric: + return metric + if metric == "tokens_per_second": + return "" + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_') :]}" 
+ return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) + + +def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]: + return { + canonical_key: float(value) + for key, value in metrics.items() + if (canonical_key := _canonicalize_upstream_metric_key(key)) + } + + class ServerlessBackend(Backend): def __init__( self, *, api_key: str | None = None, base_url: str | None = None @@ -233,20 +278,28 @@ async def train( # type: ignore[override] # Collect metrics from training training_metrics: list[dict[str, float]] = [] + trainer_started = time.monotonic() async for metrics in self._train_model( model, groups_list, config, dev_config, verbose ): training_metrics.append(metrics) # Aggregate metrics - avg_metrics: dict[str, float] = {} - if training_metrics: - avg_metrics = { - k: sum(d.get(k, 0) for d in training_metrics) - / sum(1 for d in training_metrics if k in d) - for k in {k for d in training_metrics for k in d} - if k != "num_gradient_steps" + avg_metrics = average_metric_samples(training_metrics) + summary = summarize_trajectory_groups(groups_list) + avg_metrics.setdefault( + "time/step_trainer_s", time.monotonic() - trainer_started + ) + avg_metrics.update( + { + key: value + for key, value in build_training_summary_metrics( + summary, + include_trainable_groups=True, + ).items() + if key not in avg_metrics } + ) # Get step and artifact name step = await self._get_step(model) @@ -273,6 +326,11 @@ async def _train_model( dev_config: dev.TrainConfig, verbose: bool = False, ) -> AsyncIterator[dict[str, float]]: + summary = summarize_trajectory_groups(trajectory_groups) + base_metrics = build_training_summary_metrics( + summary, + include_trainable_groups=True, + ) assert model.id is not None, "Model ID is required" training_job = await self._client.training_jobs.create( # ty:ignore[possibly-missing-attribute] model_id=model.id, @@ -305,7 +363,14 @@ async def _train_model( assert pbar is not None and num_sequences is not None 
pbar.update(1) pbar.set_postfix(event.data) - yield {**event.data, "num_gradient_steps": num_sequences} + metrics = _canonicalize_upstream_metrics( + {k: float(v) for k, v in event.data.items()} + ) + yield { + **base_metrics, + **metrics, + TRAIN_GRADIENT_STEPS_KEY: float(num_sequences), + } elif event.type == "training_started": num_sequences = event.data["num_sequences"] if pbar is None: @@ -470,7 +535,15 @@ async def _train_sft( assert pbar is not None and num_batches is not None pbar.update(1) pbar.set_postfix(event.data) - yield {**event.data, "num_gradient_steps": num_batches} + metrics = _canonicalize_upstream_metrics( + {k: float(v) for k, v in event.data.items()} + ) + yield { + **metrics, + "data/step_num_trajectories": float(num_trajectories), + "train/num_trajectories": float(num_trajectories), + TRAIN_GRADIENT_STEPS_KEY: float(num_batches), + } elif event.type == "training_started": num_batches = event.data.get("num_sequences", 0) if pbar is None: diff --git a/src/art/tinker/service.py b/src/art/tinker/service.py index ba6768eb8..1f5970aca 100644 --- a/src/art/tinker/service.py +++ b/src/art/tinker/service.py @@ -80,7 +80,7 @@ def custom_loss_fn( for mask, lp in zip(masks, logprobs_list): logprobs[mask] = lp loss = loss_fn(inputs, logprobs.unsqueeze(0), None, None, _config) - return loss.mean_policy_loss, {"policy_loss": loss.mean_policy_loss.item()} + return loss.mean_policy_loss, {"loss/train": loss.mean_policy_loss.item()} shifted_tokens = shift_tensor(packed_tensors["tokens"], 0) diff --git a/src/art/tinker_native/backend.py b/src/art/tinker_native/backend.py index e5eb1180e..500a850fa 100644 --- a/src/art/tinker_native/backend.py +++ b/src/art/tinker_native/backend.py @@ -30,6 +30,10 @@ from .. 
import dev from ..backend import Backend from ..costs import build_cost_calculator, compute_train_cost, get_model_pricing +from ..metrics_taxonomy import ( + build_training_summary_metrics, + summarize_trajectory_groups, +) from ..model import Model, TrainableModel from ..tinker.backend import get_renderer_name from ..tinker.server import get_free_port @@ -47,6 +51,35 @@ STATE_KEY_LATEST_STEP = "latest_step" T = TypeVar("T") +_UPSTREAM_TRAIN_METRIC_KEYS = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def _canonicalize_upstream_metric_key(metric: str) -> str: + if "/" in metric: + return metric + if metric == "tokens_per_second": + return "" + if metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_') :]}" + return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) + @dataclass class ModelState: @@ -208,6 +241,7 @@ async def train( # type: ignore[override] ) -> TrainResult: state = self._model_state[model.name] groups_list = list(trajectory_groups) + summary = summarize_trajectory_groups(groups_list) datums = trajectory_groups_to_datums( groups_list, @@ -217,8 +251,11 @@ async def train( # type: ignore[override] ) metrics: dict[str, float] = { - "num_groups_submitted": float(len(groups_list)), - "num_datums": float(len(datums)), + **build_training_summary_metrics( + summary, + include_trainable_groups=True, + ), + "data/step_num_datums": 
float(len(datums)), } if not datums: @@ -227,10 +264,13 @@ async def train( # type: ignore[override] train_tokens = 0 for datum in datums: train_tokens += len(datum.model_input.to_ints()) - metrics["train_tokens"] = float(train_tokens) + metrics["data/step_trainer_tokens"] = float(train_tokens) pricing = get_model_pricing(model.base_model) if pricing is not None: - metrics["costs_train"] = compute_train_cost(train_tokens, pricing) + metrics["costs/train/tinker_train"] = compute_train_cost( + train_tokens, pricing + ) + trainer_started = time.monotonic() if adam_params is None: adam_params = tinker.AdamParams( @@ -268,12 +308,16 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum: for key, value in forward_output.metrics.items(): if value is None: continue - metrics[key] = float(value) + canonical_key = _canonicalize_upstream_metric_key(key) + if canonical_key: + metrics[canonical_key] = float(value) if optim_output.metrics: for key, value in optim_output.metrics.items(): if value is None: continue - metrics[key] = float(value) + canonical_key = _canonicalize_upstream_metric_key(key) + if canonical_key: + metrics[canonical_key] = float(value) next_step = state.current_step + 1 checkpoint_name = f"step_{next_step:06d}" @@ -298,6 +342,7 @@ def remove_mask(datum: tinker.Datum) -> tinker.Datum: state.current_step = next_step self._persist_model_state(model, state) + metrics["time/step_trainer_s"] = time.monotonic() - trainer_started return TrainResult(step=state.current_step, metrics=metrics) diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index fad04fbc3..76ab19911 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -13,6 +13,7 @@ from datasets import Dataset import peft import torch +from torch.optim import Optimizer from transformers import GenerationMixin, PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase from trl import GRPOConfig, GRPOTrainer @@ -190,6 +191,13 @@ def 
save_checkpoint( return checkpoint_dir +def _get_trainer_optimizer(trainer: GRPOTrainer) -> Optimizer: + optimizer = cast(Optimizer | None, getattr(trainer, "optimizer", None)) + if optimizer is None: + raise RuntimeError("Trainer optimizer must be initialized before training") + return optimizer + + # ============================================================================ # Model Classes # ============================================================================ @@ -541,10 +549,11 @@ def _reset_optimizer_if_mode_changed( mode_changed = ( self._last_training_mode is not None and self._last_training_mode != mode ) + optimizer = _get_trainer_optimizer(self._state.trainer) if mode_changed: # Clear all optimizer state (exp_avg, exp_avg_sq, step for each param) - self._state.trainer.optimizer.state.clear() + optimizer.state.clear() self._last_training_mode = mode @@ -576,9 +585,10 @@ async def _train_dedicated( ) -> AsyncIterator[dict[str, float]]: """Train in dedicated mode — no sleep/wake, vLLM keeps running on separate GPU.""" self._reset_optimizer_if_mode_changed("rl") + optimizer = _get_trainer_optimizer(self._state.trainer) rl_weight_decay = 0.1 - for param_group in self._state.trainer.optimizer.param_groups: + for param_group in optimizer.param_groups: param_group["weight_decay"] = rl_weight_decay packed_tensors = packed_tensors_from_dir(**disk_packed_tensors) @@ -661,10 +671,11 @@ async def _train_shared( # Reset optimizer state if switching from SFT to RL self._reset_optimizer_if_mode_changed("rl") + optimizer = _get_trainer_optimizer(self._state.trainer) # Set RL-specific hyperparameters rl_weight_decay = 0.1 - for param_group in self._state.trainer.optimizer.param_groups: + for param_group in optimizer.param_groups: param_group["weight_decay"] = rl_weight_decay # Load packed tensors @@ -794,7 +805,7 @@ async def train_sft( # Get model and optimizer peft_model = self._state.peft_model self._reset_optimizer_if_mode_changed("sft") - optimizer = 
self._state.trainer.optimizer + optimizer = _get_trainer_optimizer(self._state.trainer) # Set SFT-specific hyperparameters sft_weight_decay = 0.01 @@ -873,12 +884,11 @@ async def train_sft( batch_idx += 1 yield { - "loss": batch_loss, - "learning_rate": batch.learning_rate, - "grad_norm": grad_norm, - "num_trajectories": float(batch.num_trajectories), - "num_trainable_tokens": float(batch.num_trainable_tokens), - "tokens_per_second": tokens_per_second, + "loss/train": batch_loss, + "loss/learning_rate": batch.learning_rate, + "loss/grad_norm": grad_norm, + "train/num_trajectories": float(batch.num_trajectories), + "train/num_trainable_tokens": float(batch.num_trainable_tokens), } # === Cleanup === diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py index fcb7e287a..399c1c728 100644 --- a/src/art/unsloth/train.py +++ b/src/art/unsloth/train.py @@ -3,7 +3,7 @@ from contextlib import contextmanager, nullcontext import gc import os -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, cast import nest_asyncio from peft.peft_model import PeftModel @@ -19,6 +19,43 @@ nest_asyncio.apply() +_UPSTREAM_TRAIN_METRIC_KEYS = { + "reward": "reward/mean", + "reward_std_dev": "reward/std_dev", + "exception_rate": "reward/exception_rate", + "policy_loss": "loss/train", + "loss": "loss/train", + "entropy": "loss/entropy", + "kl_div": "loss/kl_div", + "kl_policy_ref": "loss/kl_policy_ref", + "grad_norm": "loss/grad_norm", + "learning_rate": "loss/learning_rate", + "num_groups_submitted": "train/num_groups_submitted", + "num_groups_trainable": "train/num_groups_trainable", + "num_trajectories": "train/num_trajectories", + "num_trainable_tokens": "train/num_trainable_tokens", + "train_tokens": "data/step_trainer_tokens", + "num_datums": "data/step_num_datums", +} + + +def _canonicalize_upstream_metric_key(metric: str) -> str: + if "/" in metric: + return metric + if metric == "tokens_per_second": + return "" + if 
metric.startswith("group_metric_"): + return f"reward/group_{metric[len('group_metric_') :]}" + return _UPSTREAM_TRAIN_METRIC_KEYS.get(metric, metric) + + +def _canonicalize_upstream_metrics(metrics: dict[str, float]) -> dict[str, float]: + return { + canonical_key: float(value) + for key, value in metrics.items() + if (canonical_key := _canonicalize_upstream_metric_key(key)) + } + async def train( trainer: "GRPOTrainer", @@ -169,19 +206,21 @@ def compute_loss( _config, ) - trainer._metrics["train"]["learning_rate"].append(config.learning_rate) - trainer._metrics["train"]["policy_loss"].append(loss.mean_policy_loss.item()) + trainer._metrics["train"]["loss/learning_rate"].append(config.learning_rate) + trainer._metrics["train"]["loss/train"].append(loss.mean_policy_loss.item()) if loss.mean_entropy is not None: - trainer._metrics["train"]["entropy"].append(loss.mean_entropy.item()) + trainer._metrics["train"]["loss/entropy"].append(loss.mean_entropy.item()) if loss.kl_policy_ref is not None: - trainer._metrics["train"]["kl_policy_ref"].append(loss.kl_policy_ref.item()) + trainer._metrics["train"]["loss/kl_policy_ref"].append( + loss.kl_policy_ref.item() + ) return loss.mean_policy_loss return compute_loss def get_log_fn( - trainer: "GRPOTrainer", results_queue: asyncio.Queue[dict[str, float]] + trainer: Any, results_queue: asyncio.Queue[dict[str, float]] ) -> Callable[..., None]: def log(logs: dict[str, float], start_time: float | None = None) -> None: metrics = { @@ -189,13 +228,18 @@ def log(logs: dict[str, float], start_time: float | None = None) -> None: } # average the metrics # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` - # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + # start with "eval_". Normalize them into the `val/...` taxonomy instead. 
if next(iter(logs.keys())).startswith("eval_"): - metrics = {f"eval_{key}": val for key, val in metrics.items()} - - logs = {**logs, **metrics} - logs.pop("learning_rate", None) - results_queue.put_nowait(logs) + normalized_metrics = {f"val/{key}": val for key, val in metrics.items()} + normalized_logs = { + f"val/{_canonicalize_upstream_metric_key(key[len('eval_') :])}": val + for key, val in logs.items() + } + results_queue.put_nowait({**normalized_metrics, **normalized_logs}) + else: + results_queue.put_nowait( + {**_canonicalize_upstream_metrics(logs), **metrics} + ) trainer._metrics["train"].clear() return log diff --git a/tests/integration/test_live_api_cost.py b/tests/integration/test_live_api_cost.py new file mode 100644 index 000000000..ad7438bee --- /dev/null +++ b/tests/integration/test_live_api_cost.py @@ -0,0 +1,224 @@ +import json +import os +from pathlib import Path +import urllib.request +from uuid import uuid4 + +import pytest + +from art import Model +from art.metrics import track_api_cost + +pytestmark = pytest.mark.live_api_cost + +_LIVE_ENV = "ART_RUN_LIVE_API_COST_TESTS" + + +def _require_live_test_env(*required_vars: str) -> None: + if os.environ.get(_LIVE_ENV) != "1": + pytest.skip(f"Set {_LIVE_ENV}=1 to run live API cost tests.") + missing = [name for name in required_vars if not os.environ.get(name)] + if missing: + pytest.skip(f"Missing required env vars: {', '.join(missing)}") + + +def _post_json(url: str, *, headers: dict[str, str], payload: dict) -> dict: + request = urllib.request.Request( + url, + data=json.dumps(payload).encode("utf-8"), + headers=headers, + method="POST", + ) + with urllib.request.urlopen(request, timeout=120) as response: + return json.loads(response.read().decode("utf-8")) + + +def _cacheable_prefix(word_count: int = 1500) -> str: + return " ".join(f"cache-token-{index % 16}" for index in range(word_count)) + + +def _history_rows(history_path: Path) -> list[dict]: + return [json.loads(line) for line in 
history_path.read_text().splitlines() if line] + + +def _openai_completion(*, api_key: str, prompt_cache_key: str, prefix: str) -> dict: + return _post_json( + "https://api.openai.com/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + payload={ + "model": "gpt-4.1", + "messages": [ + {"role": "system", "content": prefix}, + {"role": "user", "content": "Reply with OK."}, + ], + "temperature": 0, + "max_completion_tokens": 4, + "prompt_cache_key": prompt_cache_key, + }, + ) + + +def _anthropic_message(*, api_key: str, prefix: str) -> dict: + return _post_json( + "https://api.anthropic.com/v1/messages", + headers={ + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + }, + payload={ + "model": "claude-sonnet-4-6", + "max_tokens": 8, + "temperature": 0, + "system": [ + { + "type": "text", + "text": prefix, + "cache_control": {"type": "ephemeral"}, + } + ], + "messages": [ + {"role": "user", "content": "Reply with OK."}, + ], + }, + ) + + +class TestLiveApiCost: + @pytest.mark.asyncio + async def test_openai_gpt_4_1_cached_prompt_cost(self, tmp_path: Path) -> None: + _require_live_test_env("OPENAI_API_KEY") + + api_key = os.environ["OPENAI_API_KEY"] + prefix = _cacheable_prefix() + prompt_cache_key = f"art-live-api-cost-{uuid4()}" + + # Warm the cache first so the tracked request can validate cached pricing. 
+ _openai_completion( + api_key=api_key, + prompt_cache_key=prompt_cache_key, + prefix=prefix, + ) + + model = Model( + name="live-openai-api-cost", + project="live-api-cost", + base_path=str(tmp_path), + report_metrics=[], + ) + + @track_api_cost( + source="llm_judge/openai_cached_prompt", + provider="openai", + model_name="openai/gpt-4.1", + ) + def _judge() -> dict: + return _openai_completion( + api_key=api_key, + prompt_cache_key=prompt_cache_key, + prefix=prefix, + ) + + token = model.activate_metrics_context("eval") + try: + response = _judge() + finally: + token.var.reset(token) + + await model.log(trajectories=None, split="val", step=1, metrics={}) + + usage = response["usage"] + cached_tokens = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0) + assert cached_tokens > 0 + + expected_cost = ( + ((usage["prompt_tokens"] - cached_tokens) * 2.0) + + (cached_tokens * 0.5) + + (usage["completion_tokens"] * 8.0) + ) / 1_000_000 + + history_path = ( + tmp_path + / "live-api-cost" + / "models" + / "live-openai-api-cost" + / "history.jsonl" + ) + row = _history_rows(history_path)[0] + assert row["costs/eval/llm_judge/openai_cached_prompt"] == pytest.approx( + expected_cost + ) + + @pytest.mark.asyncio + async def test_anthropic_claude_sonnet_4_6_prompt_cache_cost( + self, + tmp_path: Path, + ) -> None: + _require_live_test_env("ANTHROPIC_API_KEY") + + api_key = os.environ["ANTHROPIC_API_KEY"] + prefix = _cacheable_prefix() + + model = Model( + name="live-anthropic-api-cost", + project="live-api-cost", + base_path=str(tmp_path), + report_metrics=[], + ) + + @track_api_cost( + source="llm_judge/anthropic_prompt_cache", + provider="anthropic", + model_name="anthropic/claude-sonnet-4-6", + ) + def _judge() -> dict: + return _anthropic_message(api_key=api_key, prefix=prefix) + + token = model.activate_metrics_context("eval") + try: + first_response = _judge() + finally: + token.var.reset(token) + await model.log(trajectories=None, split="val", step=1, 
metrics={}) + + token = model.activate_metrics_context("eval") + try: + second_response = _judge() + finally: + token.var.reset(token) + await model.log(trajectories=None, split="val", step=2, metrics={}) + + first_usage = first_response["usage"] + second_usage = second_response["usage"] + assert first_usage.get("cache_creation_input_tokens", 0) > 0 + assert second_usage.get("cache_read_input_tokens", 0) > 0 + + first_expected_cost = ( + (first_usage["input_tokens"] * 3.0) + + (first_usage.get("cache_creation_input_tokens", 0) * 3.75) + + (first_usage["output_tokens"] * 15.0) + ) / 1_000_000 + second_expected_cost = ( + (second_usage["input_tokens"] * 3.0) + + (second_usage.get("cache_read_input_tokens", 0) * 0.30) + + (second_usage["output_tokens"] * 15.0) + ) / 1_000_000 + + history_path = ( + tmp_path + / "live-api-cost" + / "models" + / "live-anthropic-api-cost" + / "history.jsonl" + ) + first_row, second_row = _history_rows(history_path) + + assert first_row[ + "costs/eval/llm_judge/anthropic_prompt_cache" + ] == pytest.approx(first_expected_cost) + assert second_row[ + "costs/eval/llm_judge/anthropic_prompt_cache" + ] == pytest.approx(second_expected_cost) diff --git a/tests/unit/test_frontend_logging.py b/tests/unit/test_frontend_logging.py index 202785892..2afb8af6e 100644 --- a/tests/unit/test_frontend_logging.py +++ b/tests/unit/test_frontend_logging.py @@ -17,6 +17,8 @@ import pytest from art import Model, TrainableModel, Trajectory, TrajectoryGroup +from art.local.backend import LocalBackend +from art.metrics_taxonomy import TRAIN_GRADIENT_STEPS_KEY from art.utils.trajectory_logging import read_trajectory_groups_parquet @@ -225,13 +227,12 @@ async def test_history_appends_entries( history_path = tmp_path / "test-project/models/test-model/history.jsonl" df = pl.read_ndjson(str(history_path)) - # Should have 2 entries assert len(df) == 2 # Check both splits are present columns = df.columns assert any("val/" in col for col in columns) - assert any("train/" 
in col for col in columns) + assert any("reward/" in col for col in columns) class TestPathStructure: @@ -337,10 +338,22 @@ async def test_metric_prefixes(self, tmp_path: Path): entry = json.loads(f.readline()) # All metrics should be prefixed (except step and recorded_at) - metric_keys = [k for k in entry.keys() if k not in ["step", "recorded_at"]] - assert all(k.startswith("val/") for k in metric_keys), ( - f"Not all metrics prefixed: {metric_keys}" + metric_keys = [ + k + for k in entry.keys() + if k + not in [ + "step", + "recorded_at", + "training_step", + "time/wall_clock_sec", + ] + ] + assert all(k.startswith(("val/", "data/")) for k in metric_keys), ( + f"Not all metrics routed into taxonomy namespaces: {metric_keys}" ) + assert entry["training_step"] == 0 + assert entry["time/wall_clock_sec"] >= 0 @pytest.mark.asyncio async def test_standard_metrics_present(self, tmp_path: Path): @@ -455,6 +468,340 @@ async def test_exception_rate_calculation(self, tmp_path: Path): # All successful trajectories = 0% exception rate assert entry["val/exception_rate"] == 0.0 + @pytest.mark.asyncio + async def test_exception_rate_counts_group_exceptions(self, tmp_path: Path): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + trajectory_groups = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.5, + messages_and_choices=[{"role": "user", "content": "test"}], + ) + ], + exceptions=[ValueError("boom")], + ) + ] + + await model.log(trajectory_groups, split="val") + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["val/exception_rate"] == pytest.approx(0.5) + + @pytest.mark.asyncio + async def test_generator_of_trajectories_is_consumed_once(self, tmp_path: Path): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + def trajectories(): + yield Trajectory( + 
reward=1.0, + metrics={"custom": 1.0}, + messages_and_choices=[{"role": "user", "content": "first"}], + ) + yield Trajectory( + reward=3.0, + metrics={"custom": 3.0}, + messages_and_choices=[{"role": "user", "content": "second"}], + ) + + await model.log(trajectories(), split="val") + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["val/reward"] == pytest.approx(2.0) + assert entry["val/custom"] == pytest.approx(2.0) + + @pytest.mark.asyncio + async def test_train_trajectory_metrics_default_to_reward_prefix( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + trajectories = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.7, + metrics={ + "custom_score": 1.0, + "reward/prefixed": 2.0, + }, + messages_and_choices=[{"role": "user", "content": "test"}], + ) + ], + exceptions=[], + ) + ] + + await model.log(trajectories, split="train") + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["reward/mean"] == 0.7 + assert entry["reward/exception_rate"] == 0.0 + assert "train/reward" not in entry + assert entry["reward/custom_score"] == 1.0 + assert entry["reward/prefixed"] == 2.0 + + @pytest.mark.asyncio + async def test_train_logs_add_default_data_metrics_from_trajectory_groups( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + trajectories = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.8, + messages_and_choices=[{"role": "user", "content": "a"}], + ), + Trajectory( + reward=0.2, + messages_and_choices=[{"role": "user", "content": "b"}], + ), + ], + metadata={"scenario_id": "scenario-1"}, + ), + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.5, + messages_and_choices=[{"role": 
"user", "content": "c"}], + ) + ], + exceptions=[], + metadata={"scenario_id": "scenario-2"}, + ), + ] + + await model.log(trajectories, split="train", step=1) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + rows = [json.loads(line) for line in f if line.strip()] + + merged: dict[str, float] = {} + for row in rows: + merged.update(row) + + assert merged["data/step_num_scenarios"] == pytest.approx(2.0) + assert merged["data/step_num_trajectories"] == pytest.approx(3.0) + assert merged["data/step_num_groups_submitted"] == pytest.approx(2.0) + assert merged["data/step_num_groups_trainable"] == pytest.approx(1.0) + assert merged["data/cum/num_unique_scenarios"] == pytest.approx(2.0) + assert merged["train/num_groups_submitted"] == pytest.approx(2.0) + assert merged["train/num_groups_trainable"] == pytest.approx(1.0) + assert merged["train/num_trajectories"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_costs_are_logged_in_hierarchical_taxonomy(self, tmp_path: Path): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + await model.log( + trajectories=None, + split="train", + step=1, + metrics={ + "costs/train/prefill": 0.2, + "costs/train/sample": 0.3, + }, + ) + await model.log( + trajectories=None, + split="train", + step=2, + metrics={ + "costs/train/prefill": 0.1, + }, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["costs/train/prefill"] == pytest.approx(0.2) + assert first["costs/train/sample"] == pytest.approx(0.3) + assert first["costs/train"] == pytest.approx(0.5) + assert first["costs/all"] == pytest.approx(0.5) + assert first["costs/cum/all"] == pytest.approx(0.5) + + assert second["costs/train/prefill"] == pytest.approx(0.1) + assert second["costs/cum/train/prefill"] == pytest.approx(0.3) + assert 
second["costs/cum/train"] == pytest.approx(0.6) + assert second["costs/cum/all"] == pytest.approx(0.6) + + @pytest.mark.asyncio + async def test_cost_cumulative_persists_across_model_recreation( + self, tmp_path: Path + ): + model_1 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + await model_1.log( + trajectories=None, + split="train", + step=1, + metrics={"costs/train/prefill": 0.25}, + ) + + model_2 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + await model_2.log( + trajectories=None, + split="train", + step=2, + metrics={"costs/train/prefill": 0.75}, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["costs/cum/train/prefill"] == pytest.approx(0.25) + assert second["costs/cum/train/prefill"] == pytest.approx(1.0) + assert second["costs/cum/all"] == pytest.approx(1.0) + + @pytest.mark.asyncio + async def test_metrics_builder_loads_resume_state_before_builder_use( + self, tmp_path: Path + ): + model_1 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + model_1.metrics_builder().add_data(scenario_ids=["scenario-a"]) + await model_1.log(trajectories=None, split="train", step=1, metrics={}) + + model_2 = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + model_2.metrics_builder().add_data(scenario_ids=["scenario-b"]) + await model_2.log(trajectories=None, split="train", step=2, metrics={}) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + first = json.loads(f.readline()) + second = json.loads(f.readline()) + + assert first["data/cum/num_unique_scenarios"] == pytest.approx(1.0) + assert second["data/cum/num_unique_scenarios"] == pytest.approx(2.0) + + @pytest.mark.asyncio + async def 
test_direct_time_and_data_metrics_get_cumulative_variants( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + await model.log( + trajectories=None, + split="train", + step=1, + metrics={ + "time/step_actor_s": 1.5, + "data/step_actor_tokens": 10, + }, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["time/step_actor_s"] == pytest.approx(1.5) + assert entry["time/cum/actor_s"] == pytest.approx(1.5) + assert entry["data/step_actor_tokens"] == pytest.approx(10) + assert entry["data/cum/actor_tokens"] == pytest.approx(10) + + @pytest.mark.asyncio + async def test_log_without_new_builder_metrics_skips_extra_taxonomy_row( + self, tmp_path: Path + ): + model = Model( + name="test", + project="test", + base_path=str(tmp_path), + report_metrics=[], + ) + + model.metrics_builder().add_data(scenario_ids=["scenario-a"]) + await model.log( + trajectories=None, + split="train", + step=1, + metrics={ + "time/step_trainer_s": 2.0, + "data/step_trainer_tokens": 20.0, + }, + ) + await model.log( + trajectories=None, + split="train", + step=2, + metrics={"loss/train": 1.0}, + ) + + history_path = tmp_path / "test/models/test/history.jsonl" + rows = [json.loads(line) for line in history_path.open() if line.strip()] + + assert len(rows) == 2 + assert rows[0]["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) + assert rows[0]["data/cum/num_unique_scenarios"] == pytest.approx(1.0) + assert rows[1]["loss/train"] == pytest.approx(1.0) + assert "throughput/avg_trainer_tok_per_s" not in rows[1] + assert "data/cum/num_unique_scenarios" not in rows[1] + class TestWandbIntegration: """Test wandb integration logic (without mocking wandb itself).""" @@ -551,6 +898,95 @@ def test_should_log_wandb_logic_empty_list(self, tmp_path: Path): assert should_log is False +class TestLocalBackendAutomaticMetrics: + 
@pytest.mark.asyncio + async def test_train_logs_automatic_wall_time_and_gpu_cost( + self, tmp_path: Path + ) -> None: + backend = LocalBackend(gpu_cost_per_hour_usd=3.0) + + with patch("art.model.time.monotonic", side_effect=[100.0, 106.0, 111.0]): + model = TrainableModel( + name="test-model", + project="test-project", + base_model="Qwen/Qwen3-4B-Instruct-2507", + base_path=str(tmp_path), + report_metrics=[], + _internal_config={"trainer_gpu_ids": [0]}, + ) + model._backend = backend + + await model.log( + trajectories=None, + split="train", + step=1, + metrics={"loss/train": 1.0}, + ) + await model.log( + trajectories=None, + split="train", + step=2, + metrics={"loss/train": 0.5}, + ) + + history_path = tmp_path / "test-project/models/test-model/history.jsonl" + rows = [json.loads(line) for line in history_path.open() if line.strip()] + + first_gpu_cost = 6.0 * 3.0 / 3600.0 + second_gpu_cost = 5.0 * 3.0 / 3600.0 + + assert rows[0]["time/step_wall_s"] == pytest.approx(6.0) + assert rows[0]["costs/gpu"] == pytest.approx(first_gpu_cost) + assert rows[0]["costs/all"] == pytest.approx(first_gpu_cost) + assert rows[0]["costs/cum/gpu"] == pytest.approx(first_gpu_cost) + + assert rows[1]["time/step_wall_s"] == pytest.approx(5.0) + assert rows[1]["costs/gpu"] == pytest.approx(second_gpu_cost) + assert rows[1]["costs/cum/gpu"] == pytest.approx( + first_gpu_cost + second_gpu_cost + ) + assert rows[1]["costs/cum/all"] == pytest.approx( + first_gpu_cost + second_gpu_cost + ) + + @pytest.mark.asyncio + async def test_unknown_local_gpu_skips_cost_but_keeps_wall_time( + self, tmp_path: Path + ) -> None: + backend = LocalBackend() + + with patch("art.model.time.monotonic", side_effect=[50.0, 55.0]): + with patch("art.local.backend.torch.cuda.is_available", return_value=True): + with patch("art.local.backend.torch.cuda.device_count", return_value=1): + with patch( + "art.local.backend.torch.cuda.get_device_name", + return_value="NVIDIA A100-SXM4-80GB", + ): + model = 
TrainableModel( + name="test-model", + project="test-project", + base_model="Qwen/Qwen3-4B-Instruct-2507", + base_path=str(tmp_path), + report_metrics=[], + _internal_config={"trainer_gpu_ids": [0]}, + ) + model._backend = backend + await model.log( + trajectories=None, + split="train", + step=1, + metrics={"loss/train": 1.0}, + ) + + history_path = tmp_path / "test-project/models/test-model/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["time/step_wall_s"] == pytest.approx(5.0) + assert "costs/gpu" not in entry + assert "costs/all" not in entry + + class TestModelAttributes: """Test new Model attributes.""" @@ -593,9 +1029,21 @@ async def test_train_sft_aggregates_metrics(self, tmp_path: Path): async def mock_train_sft(*args, **kwargs): # Simulate 3 batches with different metrics - yield {"loss": 1.0, "learning_rate": 1e-4, "grad_norm": 0.5} - yield {"loss": 0.8, "learning_rate": 1e-4, "grad_norm": 0.4} - yield {"loss": 0.6, "learning_rate": 1e-4, "grad_norm": 0.3} + yield { + "loss/train": 1.0, + "loss/learning_rate": 1e-4, + "loss/grad_norm": 0.5, + } + yield { + "loss/train": 0.8, + "loss/learning_rate": 1e-4, + "loss/grad_norm": 0.4, + } + yield { + "loss/train": 0.6, + "loss/learning_rate": 1e-4, + "loss/grad_norm": 0.3, + } mock_backend._train_sft = mock_train_sft mock_backend._get_step = AsyncMock(return_value=1) # Step after training @@ -625,11 +1073,16 @@ async def mock_train_sft(*args, **kwargs): assert len(lines) == 1, f"Expected 1 log entry, got {len(lines)}" - # Verify metrics are aggregated (averaged) - entry = json.loads(lines[0]) - assert entry["step"] == 1 - assert entry["train/loss"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 - assert entry["train/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 + entries = [json.loads(line) for line in lines] + merged: dict[str, float] = {} + for entry in entries: + merged.update(entry) + + assert all(entry["step"] == 1 for entry in entries) + 
assert merged["loss/train"] == pytest.approx(0.8) # (1.0 + 0.8 + 0.6) / 3 + assert merged["loss/grad_norm"] == pytest.approx(0.4) # (0.5 + 0.4 + 0.3) / 3 + assert merged["time/step_trainer_s"] >= 0 + assert merged["time/cum/trainer_s"] >= 0 @pytest.mark.asyncio async def test_train_sft_single_step_increment(self, tmp_path: Path): @@ -667,7 +1120,7 @@ async def mock_train_sft(*args, **kwargs): df = pl.read_ndjson(str(history_path)) assert len(df) == 1, "Should have exactly 1 log entry" - assert df["step"][0] == 1, "Step should be 1 (single increment)" + assert set(df["step"].to_list()) == {1}, "Step should be 1 (single increment)" @pytest.mark.asyncio async def test_train_sft_no_metrics_when_empty(self, tmp_path: Path): @@ -698,3 +1151,92 @@ async def mock_train_sft(*args, **kwargs): assert not history_path.exists(), ( "No history.jsonl should be created for empty training" ) + + +class TestGradientStepMetrics: + @pytest.mark.asyncio + async def test_model_train_logs_gradient_step_count(self, tmp_path: Path): + model = TrainableModel( + name="test-train", + project="test-project", + base_model="gpt-4", + base_path=str(tmp_path), + report_metrics=[], + ) + + async def mock_train_model(*args, **kwargs): + for loss in (1.0, 0.8, 0.6): + yield { + "loss/train": loss, + TRAIN_GRADIENT_STEPS_KEY: 3.0, + } + + mock_backend = MagicMock() + mock_backend._train_model = mock_train_model + mock_backend._get_step = AsyncMock(return_value=1) + model._backend = mock_backend + + groups = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=1.0, + messages_and_choices=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ], + ) + ] + ) + ] + + await model.train(groups) + + history_path = tmp_path / "test-project/models/test-train/history.jsonl" + rows = [json.loads(line) for line in history_path.open() if line.strip()] + merged: dict[str, float] = {} + for row in rows: + merged.update(row) + + assert merged[TRAIN_GRADIENT_STEPS_KEY] == 
pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_local_backend_train_returns_gradient_step_count( + self, tmp_path: Path + ): + model = TrainableModel( + name="test-backend-train", + project="test-project", + base_model="gpt-4", + base_path=str(tmp_path), + report_metrics=[], + ) + backend = LocalBackend(path=str(tmp_path)) + + async def mock_train_model(*args, **kwargs): + for loss in (1.0, 0.8): + yield { + "loss/train": loss, + TRAIN_GRADIENT_STEPS_KEY: 2.0, + } + + backend._train_model = mock_train_model # type: ignore[method-assign] + backend._get_step = AsyncMock(return_value=1) # type: ignore[method-assign] + + groups = [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=1.0, + messages_and_choices=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ], + ) + ] + ) + ] + + result = await backend.train(model, groups, save_checkpoint=False) + + assert result.metrics[TRAIN_GRADIENT_STEPS_KEY] == pytest.approx(2.0) diff --git a/tests/unit/test_metric_routing.py b/tests/unit/test_metric_routing.py new file mode 100644 index 000000000..8f6cad928 --- /dev/null +++ b/tests/unit/test_metric_routing.py @@ -0,0 +1,122 @@ +import json +import os +from pathlib import Path +import types +from unittest.mock import MagicMock, patch + +from art import Model + + +class TestMetricRoutingBaseline: + def test_log_metrics_routes_known_sections_without_split_prefix( + self, tmp_path: Path + ) -> None: + model = Model( + name="test-model", + project="test-project", + base_path=str(tmp_path), + report_metrics=[], + ) + + model._log_metrics( + { + "reward/mean": 0.9, + "custom": 1.0, + "rewardish/value": 2.0, + }, + split="train", + step=7, + ) + + history_path = tmp_path / "test-project/models/test-model/history.jsonl" + with open(history_path) as f: + entry = json.loads(f.readline()) + + assert entry["reward/mean"] == 0.9 + assert entry["train/custom"] == 1.0 + assert entry["train/rewardish/value"] == 2.0 + assert 
entry["training_step"] == 7 + assert entry["time/wall_clock_sec"] >= 0 + + def test_get_wandb_run_registers_taxonomy_sections(self, tmp_path: Path) -> None: + fake_run = MagicMock() + fake_run._is_finished = False + + fake_wandb = types.SimpleNamespace() + fake_wandb.init = MagicMock(return_value=fake_run) + fake_wandb.define_metric = MagicMock() + fake_wandb.Settings = lambda **kwargs: kwargs + + with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False): + with patch.dict("sys.modules", {"wandb": fake_wandb}): + model = Model( + name="test-model", + project="test-project", + base_path=str(tmp_path), + ) + run = model._get_wandb_run() + + assert run is fake_run + define_calls = [ + (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list + ] + assert define_calls == [ + (("training_step",), {}), + (("time/wall_clock_sec",), {}), + (("reward/*",), {"step_metric": "training_step"}), + (("loss/*",), {"step_metric": "training_step"}), + (("throughput/*",), {"step_metric": "training_step"}), + (("costs/*",), {"step_metric": "training_step"}), + (("time/*",), {"step_metric": "training_step"}), + (("data/*",), {"step_metric": "training_step"}), + (("train/*",), {"step_metric": "training_step"}), + (("val/*",), {"step_metric": "training_step"}), + (("test/*",), {"step_metric": "training_step"}), + ] + + def test_log_metrics_defines_nested_cost_keys_with_training_step( + self, tmp_path: Path + ) -> None: + fake_run = MagicMock() + fake_run._is_finished = False + + fake_wandb = types.SimpleNamespace() + fake_wandb.init = MagicMock(return_value=fake_run) + fake_wandb.define_metric = MagicMock() + fake_wandb.Settings = lambda **kwargs: kwargs + + with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}, clear=False): + with patch.dict("sys.modules", {"wandb": fake_wandb}): + model = Model( + name="test-model", + project="test-project", + base_path=str(tmp_path), + report_metrics=["wandb"], + ) + model._log_metrics( + { + "costs/train/sample": 
0.1, + "costs/cum/train/prefill": 0.2, + }, + split="train", + step=1, + ) + + define_calls = [ + (call.args, call.kwargs) for call in fake_wandb.define_metric.call_args_list + ] + assert ( + ("costs/train/sample",), + {"step_metric": "training_step"}, + ) in define_calls + assert ( + ("costs/cum/train/prefill",), + {"step_metric": "training_step"}, + ) in define_calls + fake_run.log.assert_called_once() + logged_metrics = fake_run.log.call_args.args[0] + assert logged_metrics["costs/train/sample"] == 0.1 + assert logged_metrics["costs/cum/train/prefill"] == 0.2 + assert logged_metrics["training_step"] == 1 + assert "time/wall_clock_sec" in logged_metrics + assert fake_run.log.call_args.kwargs == {} diff --git a/tests/unit/test_metrics_builder.py b/tests/unit/test_metrics_builder.py new file mode 100644 index 000000000..dfa24a113 --- /dev/null +++ b/tests/unit/test_metrics_builder.py @@ -0,0 +1,254 @@ +import asyncio + +import pytest + +from art.metrics import MetricsBuilder + + +class TestMetricsBuilder: + @pytest.mark.asyncio + async def test_rollup_correctness_across_depths(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train/llm_judge/general_judge", usd=0.08) + builder.add_cost("train/llm_judge/hallucination_judge", usd=0.04) + builder.add_cost("train/tinker_train", usd=1.20) + builder.add_cost("train/tinker_inference", usd=0.45) + builder.add_cost("eval/llm_judge/correctness", usd=0.06) + + metrics = await builder.flush() + + assert metrics["costs/train/llm_judge"] == pytest.approx(0.12) + assert metrics["costs/train"] == pytest.approx(1.77) + assert metrics["costs/eval"] == pytest.approx(0.06) + assert metrics["costs/all"] == pytest.approx(1.83) + assert metrics["costs/cum/train/llm_judge"] == pytest.approx(0.12) + assert metrics["costs/cum/train"] == pytest.approx(1.77) + assert metrics["costs/cum/all"] == pytest.approx(1.83) + + @pytest.mark.asyncio + async def test_cum_accumulates_for_hierarchical_sections(self) -> 
None: + builder = MetricsBuilder(cost_context="train") + + builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3) + builder.add_data( + step_num_scenarios=2, + step_actor_tokens=10, + scenario_ids=["a", "b"], + ) + first = await builder.flush() + + assert first["time/cum/wall_s"] == pytest.approx(1.5) + assert first["time/cum/actor_s"] == pytest.approx(0.3) + assert first["data/cum/num_scenarios"] == pytest.approx(2) + assert first["data/cum/actor_tokens"] == pytest.approx(10) + assert first["data/cum/num_unique_scenarios"] == 2 + + builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2) + builder.add_data( + step_num_scenarios=3, + step_actor_tokens=5, + scenario_ids=["b", "c"], + ) + second = await builder.flush() + + assert second["time/cum/wall_s"] == pytest.approx(2.0) + assert second["time/cum/actor_s"] == pytest.approx(0.5) + assert second["data/cum/num_scenarios"] == pytest.approx(5) + assert second["data/cum/actor_tokens"] == pytest.approx(15) + assert second["data/cum/num_unique_scenarios"] == 3 + + @pytest.mark.asyncio + async def test_helper_metrics_accumulate_within_a_single_step(self) -> None: + builder = MetricsBuilder(cost_context="train") + + builder.add_data(step_num_scenarios=2, step_actor_tokens=10) + builder.add_data(step_num_scenarios=3, step_actor_tokens=5) + builder.add_user_timing(step_wall_s=1.5, step_actor_s=0.3, step_eval_s=0.2) + builder.add_user_timing(step_wall_s=0.5, step_actor_s=0.2, step_eval_s=0.1) + builder.add_idle_times(step_trainer_idle_s=1.0, step_actor_idle_s=2.0) + builder.add_idle_times(step_trainer_idle_s=0.5, step_actor_idle_s=1.0) + + metrics = await builder.flush() + + assert metrics["data/step_num_scenarios"] == pytest.approx(5) + assert metrics["data/step_actor_tokens"] == pytest.approx(15) + assert metrics["time/step_wall_s"] == pytest.approx(2.0) + assert metrics["time/step_actor_s"] == pytest.approx(0.5) + assert metrics["time/step_eval_s"] == pytest.approx(0.3) + assert 
metrics["throughput/step_trainer_idle_s"] == pytest.approx(1.5) + assert metrics["throughput/step_actor_idle_s"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_throughput_metrics_derive_from_time_and_token_cumulatives( + self, + ) -> None: + builder = MetricsBuilder(cost_context="train") + + builder.add_metric("time/step_trainer_s", 4.0) + builder.add_metric("data/step_trainer_tokens", 40.0) + builder.add_metric("time/step_actor_s", 2.0) + builder.add_metric("data/step_actor_tokens", 10.0) + builder.add_idle_times(step_trainer_idle_s=1.5, step_actor_idle_s=0.5) + + metrics = await builder.flush() + + assert metrics["throughput/cum/trainer_idle_s"] == pytest.approx(1.5) + assert metrics["throughput/cum/actor_idle_s"] == pytest.approx(0.5) + assert metrics["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) + assert metrics["throughput/avg_actor_tok_per_s"] == pytest.approx(5.0) + + @pytest.mark.asyncio + async def test_costs_all_generated_for_single_and_multiple_children(self) -> None: + single = MetricsBuilder(cost_context="train") + single.add_cost("train/gpu", usd=2.0) + one = await single.flush() + assert one["costs/all"] == pytest.approx(2.0) + + multi = MetricsBuilder(cost_context="train") + multi.add_cost("train/gpu", usd=2.0) + multi.add_cost("eval/llm_judge/correctness", usd=0.5) + two = await multi.flush() + assert two["costs/all"] == pytest.approx(2.5) + + def test_leaf_parent_conflicts_raise(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train", usd=1.0) + with pytest.raises(ValueError): + builder.add_cost("train/llm_judge", usd=0.1) + + other = MetricsBuilder(cost_context="train") + other.add_cost("train/llm_judge", usd=0.1) + with pytest.raises(ValueError): + other.add_cost("train", usd=1.0) + + @pytest.mark.asyncio + async def test_duplicate_leaf_writes_are_summed(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train/gpu", usd=1.25) + 
builder.add_cost("train/gpu", usd=0.75) + + metrics = await builder.flush() + + assert metrics["costs/train/gpu"] == pytest.approx(2.0) + assert metrics["costs/train"] == pytest.approx(2.0) + assert metrics["costs/all"] == pytest.approx(2.0) + + def test_cumulative_namespace_is_reserved(self) -> None: + builder = MetricsBuilder(cost_context="train") + with pytest.raises(ValueError): + builder.add_metric("costs/cum/train/llm_judge", 0.1) + + @pytest.mark.asyncio + async def test_sparse_steps_omit_rollup_for_missing_costs(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_cost("train/gpu", usd=1.0) + first = await builder.flush() + assert first["costs/cum/train"] == pytest.approx(1.0) + + second = await builder.flush() + assert not any(key.startswith("costs/") for key in second) + + builder.add_cost("train/gpu", usd=2.0) + third = await builder.flush() + assert third["costs/train"] == pytest.approx(2.0) + assert third["costs/cum/train"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_state_dict_round_trip_preserves_cumulative_state(self) -> None: + before = MetricsBuilder(cost_context="train") + before.add_cost("train/gpu", usd=1.0) + await before.flush() + + state = before.state_dict() + after = MetricsBuilder(cost_context="train") + after.load_state_dict(state) + after.add_cost("train/gpu", usd=2.0) + + metrics = await after.flush() + assert metrics["costs/cum/train"] == pytest.approx(3.0) + assert metrics["costs/cum/all"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_loaded_state_is_shared_with_other_cost_contexts(self) -> None: + before = MetricsBuilder(cost_context="train") + before.add_cost("train/gpu", usd=1.0) + await before.flush() + + after = MetricsBuilder(cost_context="train") + after.load_state_dict(before.state_dict()) + + eval_builder = after.for_cost_context("eval") + eval_builder.add_cost("eval/judge", usd=2.0) + + metrics = await eval_builder.flush() + assert metrics["costs/eval/judge"] == 
pytest.approx(2.0) + assert metrics["costs/cum/all"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_add_response_cost_uses_registered_model_pricing(self) -> None: + builder = MetricsBuilder(cost_context="eval") + builder.register_model_pricing( + "anthropic/test-judge", + prompt_per_million=5.0, + completion_per_million=7.0, + ) + + cost = builder.add_response_cost( + "llm_judge/faithfulness", + { + "model": "anthropic/test-judge", + "usage": {"input_tokens": 40, "output_tokens": 60}, + }, + provider="anthropic", + model_name="anthropic/test-judge", + ) + + metrics = await builder.flush() + assert cost == pytest.approx(0.00062) + assert metrics["costs/eval/llm_judge/faithfulness"] == pytest.approx(0.00062) + + @pytest.mark.asyncio + async def test_unique_scenario_count_tracks_exact_ids(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_data(scenario_ids=["s1", "s2", "s3"]) + first = await builder.flush() + assert first["data/cum/num_unique_scenarios"] == 3 + + builder.add_data(scenario_ids=["s2", "s4"]) + second = await builder.flush() + assert second["data/cum/num_unique_scenarios"] == 4 + + @pytest.mark.asyncio + async def test_empty_flush_does_not_repeat_stale_derived_metrics(self) -> None: + builder = MetricsBuilder(cost_context="train") + builder.add_metric("time/step_trainer_s", 2.0) + builder.add_metric("data/step_trainer_tokens", 20.0) + builder.add_data(scenario_ids=["s1"]) + + first = await builder.flush() + assert first["throughput/avg_trainer_tok_per_s"] == pytest.approx(10.0) + assert first["data/cum/num_unique_scenarios"] == 1 + + second = await builder.flush() + assert second == {} + + @pytest.mark.asyncio + async def test_concurrent_add_cost_calls_do_not_lose_updates(self) -> None: + builder = MetricsBuilder(cost_context="train") + + async def worker() -> None: + for _ in range(25): + builder.add_cost("train/gpu", usd=0.1) + await asyncio.sleep(0) + + await asyncio.gather(*(worker() for _ in range(4))) + 
metrics = await builder.flush() + + assert metrics["costs/train/gpu"] == pytest.approx(10.0) + assert metrics["costs/all"] == pytest.approx(10.0) + + def test_contextvar_activate_and_get_active(self) -> None: + builder = MetricsBuilder(cost_context="eval") + token = builder.activate() + assert MetricsBuilder.get_active() is builder + token.var.reset(token) diff --git a/tests/unit/test_metrics_taxonomy.py b/tests/unit/test_metrics_taxonomy.py new file mode 100644 index 000000000..b2eaadc0f --- /dev/null +++ b/tests/unit/test_metrics_taxonomy.py @@ -0,0 +1,80 @@ +import pytest + +from art import Trajectory, TrajectoryGroup +from art.metrics_taxonomy import ( + TRAIN_GRADIENT_STEPS_KEY, + TrajectoryBatchSummary, + average_metric_samples, + build_training_summary_metrics, + summarize_trajectory_groups, +) + + +def test_average_metric_samples_handles_sparse_keys() -> None: + averaged = average_metric_samples( + [ + {"loss/train": 1.0, "loss/grad_norm": 0.5}, + {"loss/train": 0.5}, + {"loss/grad_norm": 1.0}, + ] + ) + + assert averaged["loss/train"] == pytest.approx(0.75) + assert averaged["loss/grad_norm"] == pytest.approx(0.75) + + +def test_build_training_summary_metrics_includes_data_and_train_sections() -> None: + summary = TrajectoryBatchSummary( + num_scenarios=2, + num_trajectories=5, + num_groups_submitted=2, + num_groups_trainable=1, + scenario_ids=["a", "b"], + ) + + metrics = build_training_summary_metrics( + summary, + include_trainable_groups=True, + ) + + assert metrics["data/step_num_scenarios"] == pytest.approx(2.0) + assert metrics["data/step_num_groups_trainable"] == pytest.approx(1.0) + assert metrics["train/num_groups_submitted"] == pytest.approx(2.0) + assert metrics["train/num_trajectories"] == pytest.approx(5.0) + + +def test_average_metric_samples_requires_invariant_gradient_step_count() -> None: + with pytest.raises(ValueError, match="must be invariant"): + average_metric_samples( + [ + {TRAIN_GRADIENT_STEPS_KEY: 2.0}, + 
{TRAIN_GRADIENT_STEPS_KEY: 3.0}, + ] + ) + + +def test_summarize_trajectory_groups_only_counts_explicit_scenario_id() -> None: + summary = summarize_trajectory_groups( + [ + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=1.0, + messages_and_choices=[{"role": "user", "content": "a"}], + ) + ], + metadata={"scenario_id": "scenario-1"}, + ), + TrajectoryGroup( + trajectories=[ + Trajectory( + reward=0.0, + messages_and_choices=[{"role": "user", "content": "b"}], + ) + ], + metadata={"scenario_scenario_id": "legacy-scenario"}, + ), + ] + ) + + assert summary.scenario_ids == ["scenario-1"] diff --git a/tests/unit/test_track_api_cost.py b/tests/unit/test_track_api_cost.py new file mode 100644 index 000000000..80553e48b --- /dev/null +++ b/tests/unit/test_track_api_cost.py @@ -0,0 +1,554 @@ +import asyncio +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from art import Model, TrainableModel, Trajectory, TrajectoryGroup +from art.costs import compute_sample_costs, get_model_pricing +from art.metrics import MetricsBuilder, track_api_cost +from art.pipeline_trainer.trainer import PipelineTrainer + + +class _OpenAIUsage: + def __init__( + self, + prompt_tokens: int, + completion_tokens: int, + *, + cached_tokens: int = 0, + ) -> None: + self.prompt_tokens = prompt_tokens + self.completion_tokens = completion_tokens + self.prompt_tokens_details = type( + "PromptTokensDetails", + (), + {"cached_tokens": cached_tokens}, + )() + + +class _OpenAIResponse: + def __init__( + self, + prompt_tokens: int, + completion_tokens: int, + *, + cached_tokens: int = 0, + model: str | None = None, + ) -> None: + self.usage = _OpenAIUsage( + prompt_tokens, + completion_tokens, + cached_tokens=cached_tokens, + ) + self.model = model + + +class _AnthropicUsage: + def __init__( + self, + input_tokens: int, + output_tokens: int, + *, + cache_creation_input_tokens: int = 0, + cache_read_input_tokens: int = 0, + ) -> None: + self.input_tokens = 
input_tokens + self.output_tokens = output_tokens + self.cache_creation_input_tokens = cache_creation_input_tokens + self.cache_read_input_tokens = cache_read_input_tokens + + +class _AnthropicResponse: + def __init__( + self, + input_tokens: int, + output_tokens: int, + *, + cache_creation_input_tokens: int = 0, + cache_read_input_tokens: int = 0, + model: str | None = None, + ) -> None: + self.usage = _AnthropicUsage( + input_tokens, + output_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + self.model = model + + +class TestTrackApiCost: + @pytest.mark.asyncio + async def test_openai_cost_extraction_with_explicit_pricing(self) -> None: + builder = MetricsBuilder(cost_context="train") + + @track_api_cost( + source="llm_judge/correctness", + provider="openai", + model_name="openai/gpt-4.1", + prompt_price_per_million=1.0, + completion_price_per_million=2.0, + ) + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse(prompt_tokens=100, completion_tokens=50) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + assert metrics["costs/train/llm_judge/correctness"] == pytest.approx(0.0002) + + @pytest.mark.asyncio + async def test_openai_cost_extraction_accounts_for_cached_tokens(self) -> None: + builder = MetricsBuilder(cost_context="train") + + @track_api_cost( + source="llm_judge/cached_openai", + provider="openai", + model_name="openai/gpt-4.1", + prompt_price_per_million=2.0, + completion_price_per_million=8.0, + cached_prompt_price_per_million=0.5, + ) + async def _judge() -> _OpenAIResponse: + return _OpenAIResponse( + prompt_tokens=2_000, + completion_tokens=100, + cached_tokens=1_500, + ) + + token = builder.activate() + try: + await _judge() + finally: + token.var.reset(token) + + metrics = await builder.flush() + assert metrics["costs/train/llm_judge/cached_openai"] == pytest.approx(0.00255) + + 
@pytest.mark.asyncio
async def test_anthropic_cost_extraction_uses_registered_model_pricing(self) -> None:
    """Check the cost recorded for an Anthropic-style response under registered pricing.

    Pricing is registered on the builder itself; the decorator is given only
    the source, provider and model name.
    """
    tracker = MetricsBuilder(cost_context="train")
    tracker.register_model_pricing(
        "anthropic/test-judge",
        prompt_per_million=5.0,
        completion_per_million=7.0,
    )

    @track_api_cost(
        source="llm_judge/faithfulness",
        provider="anthropic",
        model_name="anthropic/test-judge",
    )
    async def _judge() -> _AnthropicResponse:
        response = _AnthropicResponse(input_tokens=40, output_tokens=60)
        return response

    # Activate the builder only for the duration of the judged call.
    activation = tracker.activate()
    try:
        await _judge()
    finally:
        activation.var.reset(activation)

    flushed = await tracker.flush()
    observed = flushed["costs/train/llm_judge/faithfulness"]
    # 0.00062 == (40 * 5.0 + 60 * 7.0) / 1_000_000
    assert observed == pytest.approx(0.00062)


@pytest.mark.asyncio
async def test_anthropic_cost_extraction_accounts_for_cache_write_and_read(
    self,
) -> None:
    """Check the recorded cost when the usage carries cache-creation and cache-read tokens."""
    tracker = MetricsBuilder(cost_context="eval")
    tracker.register_model_pricing(
        "anthropic/claude-sonnet-4-6",
        prompt_per_million=3.0,
        completion_per_million=15.0,
        cache_creation_per_million=3.75,
        cache_read_per_million=0.30,
    )

    @track_api_cost(
        source="llm_judge/anthropic_cache",
        provider="anthropic",
        model_name="anthropic/claude-sonnet-4-6",
    )
    async def _judge() -> _AnthropicResponse:
        response = _AnthropicResponse(
            input_tokens=100,
            output_tokens=50,
            cache_creation_input_tokens=1_000,
            cache_read_input_tokens=500,
        )
        return response

    activation = tracker.activate()
    try:
        await _judge()
    finally:
        activation.var.reset(activation)

    flushed = await tracker.flush()
    observed = flushed["costs/eval/llm_judge/anthropic_cache"]
    # 0.00495 == (100 * 3.0 + 50 * 15.0 + 1_000 * 3.75 + 500 * 0.30) / 1_000_000
    assert observed == pytest.approx(0.00495)


@pytest.mark.asyncio
async def test_explicit_model_name_uses_global_pricing(self) -> None:
    """Compare the tracked cost with `compute_sample_costs` for the same pricing.

    No pricing is registered on the builder; the reference value is computed
    from whatever `get_model_pricing` returns for the same model name.
    """
    tracker = MetricsBuilder(cost_context="train")
    global_rates = get_model_pricing("openai/gpt-oss-20b")
    assert global_rates is not None

    @track_api_cost(
        source="llm_judge/global_pricing",
        provider="openai",
        model_name="openai/gpt-oss-20b",
    )
    async def _judge() -> _OpenAIResponse:
        response = _OpenAIResponse(
            prompt_tokens=1_000,
            completion_tokens=2_000,
            model="gpt-oss-20b",
        )
        return response

    activation = tracker.activate()
    try:
        await _judge()
    finally:
        activation.var.reset(activation)

    flushed = await tracker.flush()
    reference = compute_sample_costs(
        prompt_tokens=1_000,
        completion_tokens=2_000,
        cost_context="train",
        pricing=global_rates,
    )
    observed = flushed["costs/train/llm_judge/global_pricing"]
    expected_total = reference["costs/train/prefill"] + reference["costs/train/sample"]
    assert observed == pytest.approx(expected_total)


@pytest.mark.asyncio
async def test_explicit_model_name_uses_registered_pricing(self) -> None:
    """Check the recorded cost when the response reports a different model string.

    The response carries `model="test-judge"` while pricing is registered
    under `"anthropic/test-judge"`, the name passed to the decorator.
    """
    tracker = MetricsBuilder(cost_context="eval")
    tracker.register_model_pricing(
        "anthropic/test-judge",
        prompt_per_million=1.5,
        completion_per_million=2.5,
    )

    @track_api_cost(
        source="llm_judge/provider_resolution",
        provider="anthropic",
        model_name="anthropic/test-judge",
    )
    async def _judge() -> _AnthropicResponse:
        response = _AnthropicResponse(
            input_tokens=400,
            output_tokens=600,
            model="test-judge",
        )
        return response

    activation = tracker.activate()
    try:
        await _judge()
    finally:
        activation.var.reset(activation)

    flushed = await tracker.flush()
    observed = flushed["costs/eval/llm_judge/provider_resolution"]
    # 0.0021 == (400 * 1.5 + 600 * 2.5) / 1_000_000
    assert observed == pytest.approx(0.0021)


@pytest.mark.asyncio
async def test_explicit_model_name_does_not_depend_on_response_model(self) -> None:
    """Check the recorded cost when the response reports a dated snapshot model id.

    The decorator is given `"openai/gpt-4.1"` while the response reports
    `"gpt-4.1-2025-04-14"`; the expected figure is spelled out below.
    """
    tracker = MetricsBuilder(cost_context="train")

    @track_api_cost(
        source="llm_judge/snapshot",
        provider="openai",
        model_name="openai/gpt-4.1",
    )
    async def _judge() -> _OpenAIResponse:
        response = _OpenAIResponse(
            prompt_tokens=1_000,
            completion_tokens=100,
            cached_tokens=800,
            model="gpt-4.1-2025-04-14",
        )
        return response

    activation = tracker.activate()
    try:
        await _judge()
    finally:
        activation.var.reset(activation)

    flushed = await tracker.flush()
    # 1_000 prompt tokens split into 200 uncached + 800 cached, plus 100
    # completion tokens; the per-million rates are the ones this test assumes.
    uncached_prompt = 200 * 2.0
    cached_prompt = 800 * 0.5
    completion = 100 * 8.0
    expected_total = ((uncached_prompt) + (cached_prompt) + (completion)) / 1_000_000
    observed = flushed["costs/train/llm_judge/snapshot"]
    assert observed == pytest.approx(expected_total)

+    @pytest.mark.asyncio  # no prices on the decorator or the builder: the call must raise
+    async def test_decorator_fails_fast_without_model_aware_pricing(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+
+        @track_api_cost(
+            source="llm_judge/missing_pricing",
+            provider="openai",
+            model_name="openai/missing-pricing-model",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
+
+        token = builder.activate()
+        try:
+            with pytest.raises(ValueError, match="No pricing configured"):
+                await _judge()
+        finally:
+            token.var.reset(token)  # NOTE(review): looks like a contextvars reset - confirm
+
+    @pytest.mark.asyncio  # a registered extractor's value is what gets reported
+    async def test_custom_extractor_takes_precedence(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        builder.register_cost_extractor("openai", lambda _response: 0.75)
+
+        @track_api_cost(
+            source="llm_judge/custom",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,  # explicit prices are still passed here
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=1, completion_tokens=1)
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        assert metrics["costs/train/llm_judge/custom"] == pytest.approx(0.75)
+
+    @pytest.mark.asyncio  # with no builder activated, the wrapped call still returns
+    async def test_decorator_noops_without_active_builder(self) -> None:
+        @track_api_cost(
+            source="llm_judge/no_context",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=10, completion_tokens=20)
+
+        result = await _judge()  # builder.activate() is never called in this test
+        assert isinstance(result, _OpenAIResponse)
+
+    @pytest.mark.asyncio  # derived "eval" builder's cost appears in the parent's flush
+    async def test_for_cost_context_routes_to_eval_and_shares_state(self) -> None:
+        builder = MetricsBuilder(cost_context="train")
+        eval_builder = builder.for_cost_context("eval")
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,  # 100*1.0 + 50*2.0 per 1e6 = 0.0002
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        token = eval_builder.activate()  # the derived builder is the active one
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()  # flushed from the original "train" builder
+        assert metrics["costs/eval/llm_judge/correctness"] == pytest.approx(0.0002)
+
+
+class TestTrackApiCostIntegration:  # decorator costs observed through Model / PipelineTrainer
+    @pytest.mark.asyncio  # train and eval judge costs are read back from history.jsonl
+    async def test_model_log_emits_train_and_eval_costs(self, tmp_path: Path) -> None:
+        model = Model(
+            name="metrics-cost-test",
+            project="metrics-cost-test",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,  # 100*1.0 + 50*2.0 per 1e6 = 0.0002
+        )
+        async def _train_judge() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        @track_api_cost(
+            source="llm_judge/factuality",
+            provider="anthropic",
+            model_name="anthropic/claude-sonnet-4-6",
+            prompt_price_per_million=3.0,
+            completion_price_per_million=4.0,  # 40*3.0 + 10*4.0 per 1e6 = 0.00016
+        )
+        async def _eval_judge() -> _AnthropicResponse:
+            return _AnthropicResponse(input_tokens=40, output_tokens=10)
+
+        train_token = model.activate_metrics_context("train")
+        try:
+            await _train_judge()
+        finally:
+            train_token.var.reset(train_token)
+
+        await model.log(trajectories=None, split="train", step=1, metrics={})
+
+        eval_token = model.activate_metrics_context("eval")
+        try:
+            await _eval_judge()
+        finally:
+            eval_token.var.reset(eval_token)
+
+        await model.log(trajectories=None, split="val", step=2, metrics={})
+
+        history_path = (
+            tmp_path
+            / "metrics-cost-test"
+            / "models"
+            / "metrics-cost-test"
+            / "history.jsonl"
+        )
+        with open(history_path) as f:
+            first = json.loads(f.readline())  # expected: row from the step=1 log call
+            second = json.loads(f.readline())  # expected: row from the step=2 log call
+
+        assert first["costs/train/llm_judge/correctness"] == pytest.approx(0.0002)
+        assert second["costs/eval/llm_judge/factuality"] == pytest.approx(0.00016)
+        assert second["costs/cum/all"] == pytest.approx(0.00036)  # 0.0002 + 0.00016
+
+    @pytest.mark.asyncio  # rollout_fn must observe the "train" cost context
+    async def test_pipeline_trainer_activates_train_context_for_rollouts(
+        self, tmp_path: Path
+    ) -> None:
+        model = TrainableModel(
+            name="pipeline-context-test",
+            project="pipeline-context-test",
+            base_model="test-model",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = MagicMock()  # stand-in; no assertions are made on it
+        observed_contexts: list[str] = []
+
+        async def rollout_fn(
+            _model: TrainableModel,
+            _scenario: dict,
+            _config: dict,
+        ) -> TrajectoryGroup:
+            observed_contexts.append(MetricsBuilder.get_active().cost_context)
+            return TrajectoryGroup(
+                [
+                    Trajectory(
+                        reward=1.0,
+                        messages_and_choices=[
+                            {"role": "user", "content": "hello"},
+                            {"role": "assistant", "content": "hi"},
+                        ],
+                    )
+                ]
+            )
+
+        trainer = PipelineTrainer(
+            model=model,
+            backend=backend,
+            rollout_fn=rollout_fn,
+            scenarios=[{"metadata": {"scenario_id": "s1"}}],
+            config={},
+            num_rollout_workers=1,
+            min_batch_size=1,
+            max_batch_size=1,
+            eval_fn=None,
+        )
+        trainer._output_queue = asyncio.Queue()  # set by hand; worker is called directly
+
+        await trainer._rollout_worker(worker_id=0)
+
+        assert observed_contexts == ["train"]  # one scenario, one recorded context
+
+    @pytest.mark.asyncio  # eval_fn must observe the "eval" cost context and log its cost
+    async def test_pipeline_trainer_activates_eval_context_for_eval_fn(
+        self, tmp_path: Path
+    ) -> None:
+        model = TrainableModel(
+            name="pipeline-eval-context-test",
+            project="pipeline-eval-context-test",
+            base_model="test-model",
+            base_path=str(tmp_path),
+            report_metrics=[],
+        )
+        backend = MagicMock()  # stand-in; no assertions are made on it
+        observed_contexts: list[str] = []
+
+        @track_api_cost(
+            source="llm_judge/correctness",
+            provider="openai",
+            model_name="openai/gpt-4.1",
+            prompt_price_per_million=1.0,
+            completion_price_per_million=2.0,
+        )
+        async def _judge_call() -> _OpenAIResponse:
+            return _OpenAIResponse(prompt_tokens=100, completion_tokens=50)
+
+        async def eval_fn(
+            _model: TrainableModel,
+            _step: int,
+            _config: dict,
+        ) -> list[Trajectory]:
+            observed_contexts.append(MetricsBuilder.get_active().cost_context)
+            await _judge_call()  # cost-tracked call made from inside eval_fn
+            return [
+                Trajectory(
+                    reward=1.0,
+                    messages_and_choices=[
+                        {"role": "user", "content": "hello"},
+                        {"role": "assistant", "content": "hi"},
+                    ],
+                )
+            ]
+
+        trainer = PipelineTrainer(
+            model=model,
+            backend=backend,
+            rollout_fn=lambda *_args, **_kwargs: asyncio.sleep(0),  # placeholder
+            scenarios=[],
+            config={},
+            num_rollout_workers=1,
+            min_batch_size=1,
+            max_batch_size=1,
+            eval_fn=eval_fn,
+        )
+
+        await trainer._run_eval(step=1)  # the test calls only _run_eval
+
+        assert observed_contexts == ["eval"]
+
+        history_path = (
+            tmp_path
+            / "pipeline-eval-context-test"
+            / "models"
+            / "pipeline-eval-context-test"
+            / "history.jsonl"
+        )
+        with open(history_path) as f:
+            rows = [json.loads(line) for line in f if line.strip()]  # skip blank lines
+
+        assert any("costs/eval/llm_judge/correctness" in row for row in rows)
+        assert any("time/step_eval_s" in row for row in rows)
diff --git a/tests/unit/test_unsloth_metrics.py b/tests/unit/test_unsloth_metrics.py
new file mode 100644
index 000000000..fdb91b0c4
--- /dev/null
+++ b/tests/unit/test_unsloth_metrics.py
@@ -0,0 +1,25 @@
+import asyncio
+from collections import defaultdict
+
+from art.unsloth.train import get_log_fn
+
+
+class _DummyTrainer:  # minimal stand-in exposing only _metrics["train"]
+    def __init__(self) -> None:
+        self._metrics = {"train": defaultdict(list)}
+
+
+def test_get_log_fn_routes_eval_metrics_to_val_namespace() -> None:
+    trainer = _DummyTrainer()
+    trainer._metrics["train"]["loss/train"].append(1.5)
+    trainer._metrics["train"]["loss/entropy"].append(0.2)
+    results_queue: asyncio.Queue[dict[str, float]] = asyncio.Queue()
+
+    log = get_log_fn(trainer, results_queue)
+    log({"eval_loss": 1.0, "eval_runtime": 2.0})  # eval-style log payload
+
+    assert results_queue.get_nowait() == {
+        "val/loss/train": 1.0,  # equals eval_loss, not the stored 1.5
+        "val/loss/entropy": 0.2,  # equals the value appended above
+        "val/runtime": 2.0,  # equals eval_runtime
+    }