From 7d18efe79111e3d4b83a8903ada39b98a6be65da Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Sun, 3 May 2026 21:48:14 -0400 Subject: [PATCH 1/8] Stamp turn_taking MetricScore outputs with a version string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an optional `version` field to MetricScore and wire turn_taking to populate it from a `version = "v0.1"` class variable at every output site (main score, missed-turn early return, sub-metrics). This lets us tell, across partial metric reruns, which computation logic produced a given row — bump the class var when the algorithm changes. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/eva/metrics/experience/turn_taking.py | 5 +++++ src/eva/models/results.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/eva/metrics/experience/turn_taking.py b/src/eva/metrics/experience/turn_taking.py index 08034584..2c09c877 100644 --- a/src/eva/metrics/experience/turn_taking.py +++ b/src/eva/metrics/experience/turn_taking.py @@ -58,6 +58,7 @@ class TurnTakingMetric(CodeMetric): description = "Turn-taking evaluation based on per-turn latency and interruption behavior" category = "experience" pass_at_k_threshold = 0.8 + version = "v0.1" # --- Latency curve (piecewise linear). 0 outside [LATENCY_HARD_EARLY_MS, LATENCY_HARD_LATE_MS]. --- # Ramp up 0 → 1 from LATENCY_HARD_EARLY_MS to LATENCY_SWEET_SPOT_LOW_MS. 
@@ -324,6 +325,7 @@ def _wrap(key: str, value: float, normalized: bool) -> MetricScore: name=f"{cls.name}.{key}", score=value, normalized_score=value if normalized else None, + version=cls.version, ) # --- Latency --- @@ -380,6 +382,7 @@ def _pct(p: float) -> float: name=f"{cls.name}.agent_interruption.num_interruptions", score=float(sum(n_segs_list)) if n_segs_list else None, normalized_score=None, + version=cls.version, ) if overlap_ms_list: sub["agent_interruption.mean_overlap_ms"] = _wrap( @@ -481,6 +484,7 @@ async def compute(self, context: MetricContext) -> MetricScore: score=0.0, normalized_score=0.0, details=details, + version=self.version, ) score = 0.0 if missed_turn else round(statistics.mean(per_turn_score.values()), 4) @@ -492,6 +496,7 @@ async def compute(self, context: MetricContext) -> MetricScore: normalized_score=score, details=details, sub_metrics=sub_metrics, + version=self.version, ) except Exception as e: diff --git a/src/eva/models/results.py b/src/eva/models/results.py index 993184db..ebfe9ad3 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -94,6 +94,11 @@ class MetricScore(BaseModel): False, description="True when the metric had no applicable data to score (distinct from errored)", ) + version: str | None = Field( + None, + description="Metric implementation version (set by the metric class) for tracking which " + "computation logic produced this score across partial reruns", + ) sub_metrics: dict[str, "MetricScore"] | None = Field( None, description="Optional sub-metric breakdowns, aggregated generically by the runner" ) From 006cc8ce527338f6dba6a4e9c6162a7807e078dd Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Tue, 12 May 2026 13:17:27 -0400 Subject: [PATCH 2/8] Auto-stamp version + prompt_hash on every MetricScore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalize the turn_taking-only version stamp to all metrics, and add an automatic per-judge 
prompt_hash so prompt edits are detectable even without a manual version bump. Stamping happens centrally via a Pydantic model_validator that reads two contextvars set by MetricsRunner before each metric.compute() call — metric authors only declare `version = "v0.1"` on the class and the rest is automatic at every MetricScore call site (no per-site `version=self.version` plumbing). The contextvar approach is per-asyncio-task, so concurrent metrics in the same record don't bleed values into each other. On partial reruns, metrics that aren't recomputed keep whatever version/prompt_hash was on disk — the validator only fills when the field is unset, so deserialized historical rows are preserved. prompt_hash is the sha256[:12] of the *unrendered* template (so per- record variable substitutions don't change the hash). PromptManager gains `get_template(path)` to expose the raw YAML template; BaseMetric. get_judge_prompt() pushes the hash into the contextvar each call. Drift test (tests/unit/metrics/test_metric_signatures.py) compares each concrete metric class's (version, source_hash, prompt_hash) against tests/fixtures/metric_signatures.json. Authors run `python scripts/regen_metric_signatures.py` to refresh the fixture after a deliberate version bump or prompt edit. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/regen_metric_signatures.py | 30 +++++ .../metrics/accuracy/agent_speech_fidelity.py | 1 + .../accuracy/agent_speech_fidelity_s2s.py | 1 + src/eva/metrics/accuracy/faithfulness.py | 1 + src/eva/metrics/accuracy/task_completion.py | 1 + src/eva/metrics/base.py | 11 +- .../diagnostic/authentication_success.py | 1 + .../conversation_correctly_finished.py | 1 + src/eva/metrics/diagnostic/response_speed.py | 1 + src/eva/metrics/diagnostic/speakability.py | 1 + src/eva/metrics/diagnostic/stt_wer.py | 1 + .../metrics/diagnostic/tool_call_validity.py | 1 + .../transcription_accuracy_key_entities.py | 1 + src/eva/metrics/experience/conciseness.py | 1 + .../experience/conversation_progression.py | 1 + src/eva/metrics/experience/turn_taking.py | 4 - src/eva/metrics/runner.py | 4 + src/eva/metrics/signatures.py | 78 +++++++++++++ .../validation/conversation_valid_end.py | 1 + .../validation/user_behavioral_fidelity.py | 1 + .../validation/user_speech_fidelity.py | 1 + src/eva/models/results.py | 19 +++- src/eva/models/versioning.py | 24 ++++ src/eva/utils/prompt_manager.py | 18 +++ tests/fixtures/metric_signatures.json | 104 ++++++++++++++++++ tests/unit/metrics/test_metric_signatures.py | 83 ++++++++++++++ tests/unit/metrics/test_runner.py | 90 +++++++++++++++ 27 files changed, 475 insertions(+), 6 deletions(-) create mode 100644 scripts/regen_metric_signatures.py create mode 100644 src/eva/metrics/signatures.py create mode 100644 src/eva/models/versioning.py create mode 100644 tests/fixtures/metric_signatures.json create mode 100644 tests/unit/metrics/test_metric_signatures.py diff --git a/scripts/regen_metric_signatures.py b/scripts/regen_metric_signatures.py new file mode 100644 index 00000000..9c27e69e --- /dev/null +++ b/scripts/regen_metric_signatures.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +"""Regenerate tests/fixtures/metric_signatures.json. 
+ +Run this after intentionally changing a metric's logic and bumping its +`version` class attribute (or after editing its judge prompt template). +The drift test (tests/unit/metrics/test_metric_signatures.py) compares +the current state against this fixture and fails on any unintended drift. + +Usage: + python scripts/regen_metric_signatures.py +""" + +import json +from pathlib import Path + +from eva.metrics.signatures import compute_all_metric_signatures + +REPO_ROOT = Path(__file__).resolve().parent.parent +FIXTURE_PATH = REPO_ROOT / "tests" / "fixtures" / "metric_signatures.json" + + +def main() -> None: + signatures = compute_all_metric_signatures() + FIXTURE_PATH.parent.mkdir(parents=True, exist_ok=True) + FIXTURE_PATH.write_text(json.dumps(signatures, indent=2, sort_keys=True) + "\n") + print(f"Wrote {len(signatures)} metric signatures to {FIXTURE_PATH.relative_to(REPO_ROOT)}") + + +if __name__ == "__main__": + main() diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity.py b/src/eva/metrics/accuracy/agent_speech_fidelity.py index a659f7c4..d84179f7 100644 --- a/src/eva/metrics/accuracy/agent_speech_fidelity.py +++ b/src/eva/metrics/accuracy/agent_speech_fidelity.py @@ -14,6 +14,7 @@ class AgentSpeechFidelityMetric(SpeechFidelityBaseMetric): """ name = "agent_speech_fidelity" + version = "v0.1" description = "Audio-based evaluation of agent speech fidelity to the intended text" category = "accuracy" role = "assistant" diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py index c6d43bb6..2a9706cf 100644 --- a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py +++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py @@ -25,6 +25,7 @@ class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric): """ name = "agent_speech_fidelity" + version = "v0.1" description = "Audio-based evaluation of agent entity fidelity for S2S models" category = "accuracy" role = "assistant" diff --git 
a/src/eva/metrics/accuracy/faithfulness.py b/src/eva/metrics/accuracy/faithfulness.py index 3a85d29e..e6743792 100644 --- a/src/eva/metrics/accuracy/faithfulness.py +++ b/src/eva/metrics/accuracy/faithfulness.py @@ -54,6 +54,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric): """ name = "faithfulness" + version = "v0.1" description = ( "LLM judge evaluation of whether the assistant remains faithful to information, policies, and instructions" ) diff --git a/src/eva/metrics/accuracy/task_completion.py b/src/eva/metrics/accuracy/task_completion.py index f1cbe8d0..2ca780b8 100644 --- a/src/eva/metrics/accuracy/task_completion.py +++ b/src/eva/metrics/accuracy/task_completion.py @@ -35,6 +35,7 @@ class TaskCompletion(BaseMetric): """ name = "task_completion" + version = "v0.1" description = "Binary task completion via scenario DB state hash comparison" category = "accuracy" metric_type = MetricType.CODE diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py index 45a4a501..f05563e5 100644 --- a/src/eva/metrics/base.py +++ b/src/eva/metrics/base.py @@ -24,6 +24,7 @@ ) from eva.models.config import PipelineType from eva.models.results import MetricScore +from eva.models.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template from eva.utils.llm_client import LLMClient from eva.utils.logging import get_logger from eva.utils.prompt_manager import get_prompt_manager @@ -163,6 +164,9 @@ class BaseMetric(ABC): pass_at_k_threshold: float = 0.5 # Normalized score threshold for pass@k pass/fail exclude_from_pass_at_k: bool = False # Set True for metrics not suitable for pass@k supported_pipeline_types: frozenset[PipelineType] = frozenset(PipelineType) # Pipeline types this metric supports + # Bump on intentional logic changes; MetricsRunner stamps this onto every MetricScore + # produced by compute(). Concrete subclasses should set it; the drift test skips classes that leave it None. 
+ version: str | None = None # Direction of the displayed value (normalized_score if present, else score). # Override to False for lower-is-better parent metrics (e.g. latency). Sub-metric # direction is derived from the key suffix (see eva.metrics.utils.direction_for_sub_metric). @@ -179,8 +183,13 @@ def __init__(self, config: dict[str, Any] | None = None): self.prompt_manager = get_prompt_manager() def get_judge_prompt(self, prompt_key: str = "user_prompt", **variables) -> str: - """Get judge prompt using PromptManager.""" + """Get judge prompt using PromptManager. + + Stamps the unrendered template's sha256[:12] into the prompt-hash contextvar so + any MetricScore built afterwards in the same compute() picks it up automatically. + """ prompt_path = f"judge.{self.name}.{prompt_key}" + _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path))) return self.prompt_manager.get_prompt(prompt_path, **variables) @abstractmethod diff --git a/src/eva/metrics/diagnostic/authentication_success.py b/src/eva/metrics/diagnostic/authentication_success.py index 80f93905..0662231b 100644 --- a/src/eva/metrics/diagnostic/authentication_success.py +++ b/src/eva/metrics/diagnostic/authentication_success.py @@ -45,6 +45,7 @@ class AuthenticationSuccessMetric(CodeMetric): """ name = "authentication_success" + version = "v0.1" description = "Checks if session state in final DB is a superset of expected session" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/conversation_correctly_finished.py b/src/eva/metrics/diagnostic/conversation_correctly_finished.py index 6ce5cc49..c38adb7d 100644 --- a/src/eva/metrics/diagnostic/conversation_correctly_finished.py +++ b/src/eva/metrics/diagnostic/conversation_correctly_finished.py @@ -11,6 +11,7 @@ class ConversationCorrectlyFinishedMetric(CodeMetric): """0.0 when the agent timed out on the user's final turn; 1.0 otherwise.""" name = "conversation_correctly_finished" 
+ version = "v0.1" description = "Diagnostic metric: 0.0 when agent failed to respond to the user's final turn" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index a7724b07..56571e2c 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -60,6 +60,7 @@ class ResponseSpeedMetric(CodeMetric): description = "Diagnostic metric: latency between user utterance end and assistant response start" exclude_from_pass_at_k = True higher_is_better = False # Score is latency in seconds — lower is better. + version = "v0.1" async def compute(self, context: MetricContext) -> MetricScore: try: diff --git a/src/eva/metrics/diagnostic/speakability.py b/src/eva/metrics/diagnostic/speakability.py index bc9d8a45..f3e9c3b4 100644 --- a/src/eva/metrics/diagnostic/speakability.py +++ b/src/eva/metrics/diagnostic/speakability.py @@ -30,6 +30,7 @@ class SpeakabilityJudgeMetric(PerTurnConversationJudgeMetric): """ name = "speakability" + version = "v0.1" description = "Debug metric: LLM judge evaluation of text voice-friendliness per turn" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/stt_wer.py b/src/eva/metrics/diagnostic/stt_wer.py index 52ad5c2b..5c52e50c 100644 --- a/src/eva/metrics/diagnostic/stt_wer.py +++ b/src/eva/metrics/diagnostic/stt_wer.py @@ -72,6 +72,7 @@ class STTWERMetric(CodeMetric): """ name = "stt_wer" + version = "v0.1" description = "Debug metric: Speech-to-Text transcription accuracy using Word Error Rate" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/tool_call_validity.py b/src/eva/metrics/diagnostic/tool_call_validity.py index 12b4f7d8..01cfde95 100644 --- a/src/eva/metrics/diagnostic/tool_call_validity.py +++ b/src/eva/metrics/diagnostic/tool_call_validity.py @@ -39,6 +39,7 @@ class 
ToolCallValidity(CodeMetric): """ name = "tool_call_validity" + version = "v0.1" description = "Debug metric: fraction of tool calls with correctly formatted parameters" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py index ecdb79bf..4990b579 100644 --- a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py +++ b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py @@ -48,6 +48,7 @@ class TranscriptionAccuracyKeyEntitiesMetric(TextJudgeMetric): """ name = "transcription_accuracy_key_entities" + version = "v0.1" description = "Debug metric: LLM judge evaluation of STT key entity transcription accuracy for entire conversation" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/experience/conciseness.py b/src/eva/metrics/experience/conciseness.py index 3f77890e..2b4bc196 100644 --- a/src/eva/metrics/experience/conciseness.py +++ b/src/eva/metrics/experience/conciseness.py @@ -28,6 +28,7 @@ class ConcisenessJudgeMetric(PerTurnConversationJudgeMetric): """ name = "conciseness" + version = "v0.1" description = "LLM judge evaluation of assistant response conciseness" category = "experience" rating_scale = (1, 3) diff --git a/src/eva/metrics/experience/conversation_progression.py b/src/eva/metrics/experience/conversation_progression.py index 595101e4..f5abfa80 100644 --- a/src/eva/metrics/experience/conversation_progression.py +++ b/src/eva/metrics/experience/conversation_progression.py @@ -32,6 +32,7 @@ class ConversationProgressionJudgeMetric(ConversationTextJudgeMetric): """ name = "conversation_progression" + version = "v0.1" description = "LLM judge evaluation of whether the assistant moved the conversation forward productively" category = "experience" rating_scale = (1, 3) diff --git a/src/eva/metrics/experience/turn_taking.py 
b/src/eva/metrics/experience/turn_taking.py index 2c09c877..7e3da258 100644 --- a/src/eva/metrics/experience/turn_taking.py +++ b/src/eva/metrics/experience/turn_taking.py @@ -325,7 +325,6 @@ def _wrap(key: str, value: float, normalized: bool) -> MetricScore: name=f"{cls.name}.{key}", score=value, normalized_score=value if normalized else None, - version=cls.version, ) # --- Latency --- @@ -382,7 +381,6 @@ def _pct(p: float) -> float: name=f"{cls.name}.agent_interruption.num_interruptions", score=float(sum(n_segs_list)) if n_segs_list else None, normalized_score=None, - version=cls.version, ) if overlap_ms_list: sub["agent_interruption.mean_overlap_ms"] = _wrap( @@ -484,7 +482,6 @@ async def compute(self, context: MetricContext) -> MetricScore: score=0.0, normalized_score=0.0, details=details, - version=self.version, ) score = 0.0 if missed_turn else round(statistics.mean(per_turn_score.values()), 4) @@ -496,7 +493,6 @@ async def compute(self, context: MetricContext) -> MetricScore: normalized_score=score, details=details, sub_metrics=sub_metrics, - version=self.version, ) except Exception as e: diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index f7d945c5..32a5b26b 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -19,6 +19,7 @@ from eva.models.config import PipelineType, get_pipeline_type from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics +from eva.models.versioning import _CURRENT_METRIC_VERSION from eva.utils.hash_utils import get_dict_hash from eva.utils.logging import get_logger from eva.utils.pass_at_k import ( @@ -448,6 +449,9 @@ async def _run_record(self, record_id: str, record_dir: Path) -> RecordMetrics: # Create tasks for all metrics async def compute_metric(metric: BaseMetric) -> tuple[str, MetricScore]: """Compute a single metric and handle errors.""" + # Each gather() task gets its own contextvar snapshot, so this set 
is + # isolated from sibling/parent tasks — no reset needed. + _CURRENT_METRIC_VERSION.set(metric.version) try: logger.info(f"[{record_id}] Starting metric: {metric.name}") score = await metric.compute(context) diff --git a/src/eva/metrics/signatures.py b/src/eva/metrics/signatures.py new file mode 100644 index 00000000..1b5e9e98 --- /dev/null +++ b/src/eva/metrics/signatures.py @@ -0,0 +1,78 @@ +"""Compute drift signatures for metric classes. + +A metric's "signature" captures everything we want to detect changes to: + - `version`: the manually-bumped string on the class + - `source_hash`: sha256[:12] of `inspect.getsource(cls)` (class body) + - `prompt_hash`: sha256[:12] of the unrendered judge prompt template, or + None for non-judge metrics + +The drift test compares the current signatures against a checked-in fixture +and fails if anything changed without an explicit version bump + fixture regen. +""" + +import hashlib +import inspect + +# Importing the metric subpackages forces all concrete metric classes to be +# registered as BaseMetric subclasses, so walking __subclasses__ finds them. +import eva.metrics.accuracy # noqa: F401 +import eva.metrics.diagnostic # noqa: F401 +import eva.metrics.experience # noqa: F401 +import eva.metrics.validation # noqa: F401 +from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric +from eva.models.versioning import hash_prompt_template +from eva.utils.prompt_manager import get_prompt_manager + + +def _all_concrete_versioned_metric_classes() -> dict[str, type[BaseMetric]]: + """Walk BaseMetric subclasses; return concrete classes that set a version. + + Keyed on class qualname so co-named classes (e.g., the cascade vs S2S + variants of `agent_speech_fidelity`) get distinct entries. 
+ """ + result: dict[str, type[BaseMetric]] = {} + + def walk(cls: type) -> None: + for sub in cls.__subclasses__(): + walk(sub) + if inspect.isabstract(sub): + continue + # `version` is None on BaseMetric; only concrete classes that + # deliberately set it are participating. + if getattr(sub, "version", None) is None: + continue + result[sub.__qualname__] = sub + + walk(BaseMetric) + return result + + +def _source_hash(cls: type) -> str: + """sha256[:12] of the class body source.""" + return hashlib.sha256(inspect.getsource(cls).encode("utf-8")).hexdigest()[:12] + + +def _prompt_hash_for_metric(cls: type[BaseMetric]) -> str | None: + """Return the prompt template hash for judge metrics, or None. + + All judge metrics in this codebase use `judge.{name}.user_prompt`. + A judge metric without a corresponding template raises KeyError — + that's a configuration bug we want surfaced. + """ + if not issubclass(cls, TextJudgeMetric | AudioJudgeMetric): + return None + template = get_prompt_manager().get_template(f"judge.{cls.name}.user_prompt") + return hash_prompt_template(template) + + +def compute_all_metric_signatures() -> dict[str, dict[str, str | None]]: + """Return {class_qualname: {version, source_hash, prompt_hash}} for every concrete metric.""" + out: dict[str, dict[str, str | None]] = {} + for qualname, cls in _all_concrete_versioned_metric_classes().items(): + out[qualname] = { + "name": cls.name, + "version": cls.version, + "source_hash": _source_hash(cls), + "prompt_hash": _prompt_hash_for_metric(cls), + } + return out diff --git a/src/eva/metrics/validation/conversation_valid_end.py b/src/eva/metrics/validation/conversation_valid_end.py index 61831f6f..f6a6ebc5 100644 --- a/src/eva/metrics/validation/conversation_valid_end.py +++ b/src/eva/metrics/validation/conversation_valid_end.py @@ -14,6 +14,7 @@ class ConversationValidEndMetric(CodeMetric): """Binary score: 1.0 when the conversation ended on goodbye OR agent-timeout-on-user-turn; 0.0 otherwise.""" 
name = "conversation_valid_end" + version = "v0.1" description = "Validation metric: conversation reached a definitive end state" category = "validation" diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py index 56bd4da4..0af13816 100644 --- a/src/eva/metrics/validation/user_behavioral_fidelity.py +++ b/src/eva/metrics/validation/user_behavioral_fidelity.py @@ -65,6 +65,7 @@ class UserBehavioralFidelityMetric(ConversationTextJudgeMetric): """ name = "user_behavioral_fidelity" + version = "v0.1" description = "Validation metric for simulated user corruption detection" category = "validation" rating_scale = (0, 1) diff --git a/src/eva/metrics/validation/user_speech_fidelity.py b/src/eva/metrics/validation/user_speech_fidelity.py index 3f605d32..a11fc8a6 100644 --- a/src/eva/metrics/validation/user_speech_fidelity.py +++ b/src/eva/metrics/validation/user_speech_fidelity.py @@ -14,6 +14,7 @@ class UserSpeechFidelityMetric(SpeechFidelityBaseMetric): """ name = "user_speech_fidelity" + version = "v0.1" description = "Audio-based validation of user speech fidelity to the intended text" category = "validation" role = "user" diff --git a/src/eva/models/results.py b/src/eva/models/results.py index ebfe9ad3..cf9e504e 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -4,7 +4,9 @@ from datetime import datetime from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator + +from eva.models.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH class ErrorDetails(BaseModel): @@ -99,10 +101,25 @@ class MetricScore(BaseModel): description="Metric implementation version (set by the metric class) for tracking which " "computation logic produced this score across partial reruns", ) + prompt_hash: str | None = Field( + None, + description="sha256[:12] of the unrendered judge prompt template; None for non-judge metrics. 
" + "Lets us detect prompt edits without relying on the metric author to bump `version`.", + ) sub_metrics: dict[str, "MetricScore"] | None = Field( None, description="Optional sub-metric breakdowns, aggregated generically by the runner" ) + @model_validator(mode="after") + def _auto_stamp_version_and_hash(self) -> "MetricScore": + # Only fill if unset, so deserialization from disk preserves historical values + # and explicit kwargs (e.g., tests) always win. + if self.version is None: + self.version = _CURRENT_METRIC_VERSION.get() + if self.prompt_hash is None: + self.prompt_hash = _CURRENT_PROMPT_HASH.get() + return self + class PassAtKResult(BaseModel): """pass@k and pass^k result for a single metric across multiple trials.""" diff --git a/src/eva/models/versioning.py b/src/eva/models/versioning.py new file mode 100644 index 00000000..24ca9f9b --- /dev/null +++ b/src/eva/models/versioning.py @@ -0,0 +1,24 @@ +"""Per-record version + prompt-hash stamping for MetricScore. + +MetricsRunner sets these contextvars around every metric.compute() call. +The MetricScore Pydantic model has a model_validator that reads them and +auto-fills the version/prompt_hash fields when unset, so all scores and +sub-scores built inside that compute() inherit the right values without +each call site having to thread them through explicitly. + +Both contextvars default to None, which means "not currently inside a +metric compute() call" — that's the state during JSON deserialization +(loading metrics.json from disk), so existing on-disk values are +preserved instead of being overwritten with None. 
+""" + +import hashlib +from contextvars import ContextVar + +_CURRENT_METRIC_VERSION: ContextVar[str | None] = ContextVar("current_metric_version", default=None) +_CURRENT_PROMPT_HASH: ContextVar[str | None] = ContextVar("current_prompt_hash", default=None) + + +def hash_prompt_template(template: str) -> str: + """Return sha256[:12] of an unrendered prompt template string.""" + return hashlib.sha256(template.encode("utf-8")).hexdigest()[:12] diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py index c67db941..a4f30d21 100644 --- a/src/eva/utils/prompt_manager.py +++ b/src/eva/utils/prompt_manager.py @@ -81,6 +81,24 @@ def _load_from_directory(self, directory: Path) -> None: for yaml_file in yaml_files: self._load_single_file(yaml_file) + def get_template(self, path: str) -> str: + """Return the unrendered prompt template at `path` (no variable substitution). + + Used for hashing prompt templates so we can detect prompt edits across + runs without the per-record variable substitutions changing the hash. + """ + parts = path.split(".") + value = self.prompts + for part in parts: + if not isinstance(value, dict): + raise KeyError(f"Invalid prompt path: {path} (stopped at {part})") + if part not in value: + raise KeyError(f"Prompt not found: {path} (missing key: {part})") + value = value[part] + if not isinstance(value, str): + raise ValueError(f"Prompt at {path} is not a string: {type(value)}") + return value + def get_prompt(self, path: str, **variables) -> str: """Get a prompt by its path and substitute variables. 
diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json new file mode 100644 index 00000000..fc79f392 --- /dev/null +++ b/tests/fixtures/metric_signatures.json @@ -0,0 +1,104 @@ +{ + "AgentSpeechFidelityMetric": { + "name": "agent_speech_fidelity", + "prompt_hash": "864be78919d2", + "source_hash": "77743114e9b0", + "version": "v0.1" + }, + "AgentSpeechFidelityS2SMetric": { + "name": "agent_speech_fidelity", + "prompt_hash": "864be78919d2", + "source_hash": "5b3deb4968cd", + "version": "v0.1" + }, + "AuthenticationSuccessMetric": { + "name": "authentication_success", + "prompt_hash": null, + "source_hash": "cdc7c59d2684", + "version": "v0.1" + }, + "ConcisenessJudgeMetric": { + "name": "conciseness", + "prompt_hash": "5d033338d36a", + "source_hash": "cd0ea09a9613", + "version": "v0.1" + }, + "ConversationCorrectlyFinishedMetric": { + "name": "conversation_correctly_finished", + "prompt_hash": null, + "source_hash": "4f27cacab7d2", + "version": "v0.1" + }, + "ConversationProgressionJudgeMetric": { + "name": "conversation_progression", + "prompt_hash": "f3240185faf6", + "source_hash": "91b71c803d77", + "version": "v0.1" + }, + "ConversationValidEndMetric": { + "name": "conversation_valid_end", + "prompt_hash": null, + "source_hash": "02d1c3fb330b", + "version": "v0.1" + }, + "FaithfulnessJudgeMetric": { + "name": "faithfulness", + "prompt_hash": "1add7d47362c", + "source_hash": "8e3fd6bc9960", + "version": "v0.1" + }, + "ResponseSpeedMetric": { + "name": "response_speed", + "prompt_hash": null, + "source_hash": "ebce1a06bf30", + "version": "v0.1" + }, + "STTWERMetric": { + "name": "stt_wer", + "prompt_hash": null, + "source_hash": "01fcfbc1cf21", + "version": "v0.1" + }, + "SpeakabilityJudgeMetric": { + "name": "speakability", + "prompt_hash": "cd2cc44fc96c", + "source_hash": "187ddf9dc0da", + "version": "v0.1" + }, + "TaskCompletion": { + "name": "task_completion", + "prompt_hash": null, + "source_hash": "01aed1a552f4", + "version": 
"v0.1" + }, + "ToolCallValidity": { + "name": "tool_call_validity", + "prompt_hash": null, + "source_hash": "1572b16592fe", + "version": "v0.1" + }, + "TranscriptionAccuracyKeyEntitiesMetric": { + "name": "transcription_accuracy_key_entities", + "prompt_hash": "c0980ff2168d", + "source_hash": "a83a699d0fda", + "version": "v0.1" + }, + "TurnTakingMetric": { + "name": "turn_taking", + "prompt_hash": null, + "source_hash": "aa574674ff83", + "version": "v0.1" + }, + "UserBehavioralFidelityMetric": { + "name": "user_behavioral_fidelity", + "prompt_hash": "06477144c28e", + "source_hash": "af8144bd7731", + "version": "v0.1" + }, + "UserSpeechFidelityMetric": { + "name": "user_speech_fidelity", + "prompt_hash": "c4d97e36b865", + "source_hash": "e38e8c162b3d", + "version": "v0.1" + } +} diff --git a/tests/unit/metrics/test_metric_signatures.py b/tests/unit/metrics/test_metric_signatures.py new file mode 100644 index 00000000..24ce61c5 --- /dev/null +++ b/tests/unit/metrics/test_metric_signatures.py @@ -0,0 +1,83 @@ +"""Drift test: fail when a metric's source or prompt changes without a version bump. + +Each concrete metric class has three signature fields: + - version: manually bumped string on the class + - source_hash: sha256[:12] of inspect.getsource(cls) + - prompt_hash: sha256[:12] of judge.{name}.user_prompt template (None for code metrics) + +The fixture at tests/fixtures/metric_signatures.json is the source of truth for +the *currently-released* state. The test compares the current signatures to it +and reports drift. Authors update the fixture by running +`python scripts/regen_metric_signatures.py` after bumping `version`. 
+ +Failure modes the test catches: + - source_hash changed, version unchanged → "bump version then regen fixture" + - prompt_hash changed, version unchanged → "bump version then regen fixture" + - version bumped, fixture not regenerated → "regen fixture" (reported whether or not source/prompt also changed) + - new metric class with no fixture entry → "add to fixture via regen" + - metric removed from code but still in fixture → "delete from fixture via regen" +""" + +import json +from pathlib import Path + +import pytest + +from eva.metrics.signatures import compute_all_metric_signatures + +FIXTURE_PATH = Path(__file__).resolve().parents[3] / "tests" / "fixtures" / "metric_signatures.json" +REGEN_HINT = "Run `python scripts/regen_metric_signatures.py` after bumping `version` on affected classes." + + +@pytest.fixture(scope="module") +def fixture_signatures() -> dict[str, dict[str, str | None]]: + return json.loads(FIXTURE_PATH.read_text()) + + +@pytest.fixture(scope="module") +def current_signatures() -> dict[str, dict[str, str | None]]: + return compute_all_metric_signatures() + + +def test_no_unannounced_metric_drift( + fixture_signatures: dict[str, dict[str, str | None]], + current_signatures: dict[str, dict[str, str | None]], +) -> None: + """Fail if any metric's source/prompt changed without its version being bumped.""" + failures: list[str] = [] + + for qualname, current in current_signatures.items(): + recorded = fixture_signatures.get(qualname) + if recorded is None: + failures.append(f"{qualname}: new metric class not in fixture. {REGEN_HINT}") + continue + + version_changed = current["version"] != recorded["version"] + source_changed = current["source_hash"] != recorded["source_hash"] + prompt_changed = current["prompt_hash"] != recorded["prompt_hash"] + + if not (source_changed or prompt_changed or version_changed): + continue # fully in sync + + if version_changed: + # Author bumped version; they still need to regen the fixture so + # future drift is detected against the new baseline. 
+ failures.append(f"{qualname}: version bumped ({recorded['version']} → {current['version']}). {REGEN_HINT}") + continue + + # Code or prompt changed but version is unchanged — the case the test + # exists to catch. + what = [] + if source_changed: + what.append(f"source ({recorded['source_hash']} → {current['source_hash']})") + if prompt_changed: + what.append(f"prompt ({recorded['prompt_hash']} → {current['prompt_hash']})") + failures.append( + f"{qualname}: {' and '.join(what)} changed but version still {current['version']!r}. " + f"Bump `version` on the class, then run regen." + ) + + for qualname in fixture_signatures.keys() - current_signatures.keys(): + failures.append(f"{qualname}: removed from code but still in fixture. {REGEN_HINT}") + + assert not failures, "Metric signature drift detected:\n " + "\n ".join(failures) diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py index d1d587ad..fec32d0d 100644 --- a/tests/unit/metrics/test_runner.py +++ b/tests/unit/metrics/test_runner.py @@ -615,6 +615,96 @@ async def test_rerun_partial_success(self, tmp_path): assert result.metrics["m_a"].error is None assert result.metrics["m_b"].error == "still failing" + @pytest.mark.asyncio + async def test_rerun_preserves_version_of_non_rerun_metrics(self, tmp_path): + """Non-rerun metrics keep their on-disk version; only the rerun metric gets the new version. + + Models the workflow where a user bumps `m_b`'s class version (e.g., to v0.2) and + then reruns only the errored records for m_b. m_a wasn't touched, so its on-disk + version must survive untouched. 
+ """ + run_dir = _setup_run_dir(tmp_path, ["rec-0"]) + records = [_make_record("rec-0")] + record_dir = run_dir / "records" / "rec-0" + + # On disk: m_a succeeded at v0.1, m_b failed at v0.1 (and we want to rerun m_b) + _write_metrics_json( + record_dir, + "rec-0", + { + "m_a": MetricScore(name="m_a", score=0.9, normalized_score=0.9, version="v0.1"), + "m_b": MetricScore(name="m_b", score=0.0, normalized_score=0.0, error="fail-b", version="v0.1"), + }, + ) + + runner = _make_runner( + run_dir, + records, + ["m_a", "m_b"], + record_metric_filter={"rec-0": {"m_b"}}, + ) + # Simulate that m_b's class version was bumped to v0.2 between runs + _install_mock( + runner, + { + "rec-0": { + "m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2"), + }, + }, + ) + + result = await runner.run_and_save_record("rec-0", record_dir) + + # In-memory result preserves on-disk version for m_a, stamps new version on m_b + assert result.metrics["m_a"].version == "v0.1", "m_a version must not change on partial rerun" + assert result.metrics["m_b"].version == "v0.2", "m_b should be re-stamped with the new version" + + # Persisted to disk identically + on_disk = json.loads((record_dir / "metrics.json").read_text())["metrics"] + assert on_disk["m_a"]["version"] == "v0.1" + assert on_disk["m_b"]["version"] == "v0.2" + + @pytest.mark.asyncio + async def test_rerun_preserves_legacy_unversioned_metrics(self, tmp_path): + """Pre-versioning rows (no `version` on disk) stay `version=None` after a partial rerun.""" + run_dir = _setup_run_dir(tmp_path, ["rec-0"]) + records = [_make_record("rec-0")] + record_dir = run_dir / "records" / "rec-0" + + # Write a metrics.json by hand without the version field (pre-versioning format) + legacy_blob = { + "record_id": "rec-0", + "metrics": { + "m_a": {"name": "m_a", "score": 0.5, "normalized_score": 0.5, "details": {}}, + "m_b": { + "name": "m_b", + "score": 0.0, + "normalized_score": 0.0, + "error": "fail-b", + "details": {}, + }, + 
}, + } + (record_dir / "metrics.json").write_text(json.dumps(legacy_blob)) + + runner = _make_runner( + run_dir, + records, + ["m_a", "m_b"], + record_metric_filter={"rec-0": {"m_b"}}, + ) + _install_mock( + runner, + {"rec-0": {"m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2")}}, + ) + + result = await runner.run_and_save_record("rec-0", record_dir) + + # m_a was not rerun, so its legacy unversioned state must survive + assert result.metrics["m_a"].version is None + # m_b was rerun with the bumped class version + assert result.metrics["m_b"].version == "v0.2" + @pytest.mark.asyncio async def test_same_metric_fails_on_two_records(self, tmp_path): """The same metric failing on two different records is rerun independently on each.""" From 6d66128b4a6801469099d59c9bf5c5a0794bb3f7 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 13 May 2026 14:18:36 -0400 Subject: [PATCH 3/8] Remove duplicated code in `get_prompt()` --- src/eva/utils/prompt_manager.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py index a4f30d21..1060c41e 100644 --- a/src/eva/utils/prompt_manager.py +++ b/src/eva/utils/prompt_manager.py @@ -84,19 +84,26 @@ def _load_from_directory(self, directory: Path) -> None: def get_template(self, path: str) -> str: """Return the unrendered prompt template at `path` (no variable substitution). - Used for hashing prompt templates so we can detect prompt edits across + Args: + path: Dot-separated path to the prompt (e.g., "orchestrator.system_prompt") + + Used as is for hashing prompt templates so we can detect prompt edits across runs without the per-record variable substitutions changing the hash. 
""" + # Navigate to the prompt using the dot-separated path parts = path.split(".") value = self.prompts + for part in parts: if not isinstance(value, dict): raise KeyError(f"Invalid prompt path: {path} (stopped at {part})") if part not in value: raise KeyError(f"Prompt not found: {path} (missing key: {part})") value = value[part] + if not isinstance(value, str): raise ValueError(f"Prompt at {path} is not a string: {type(value)}") + return value def get_prompt(self, path: str, **variables) -> str: @@ -113,19 +120,7 @@ def get_prompt(self, path: str, **variables) -> str: KeyError: If the prompt path is not found ValueError: If the prompt is not a string """ - # Navigate to the prompt using the dot-separated path - parts = path.split(".") - value = self.prompts - - for part in parts: - if not isinstance(value, dict): - raise KeyError(f"Invalid prompt path: {path} (stopped at {part})") - if part not in value: - raise KeyError(f"Prompt not found: {path} (missing key: {part})") - value = value[part] - - if not isinstance(value, str): - raise ValueError(f"Prompt at {path} is not a string: {type(value)}") + value = self.get_template(path) # Substitute variables using str.format() # Auto-inject global variables from the _shared section (prompt-level vars take precedence) From bd30a7c3d31376c179b1e60f913fbb17845af04e Mon Sep 17 00:00:00 2001 From: Joseph Marinier Date: Wed, 13 May 2026 14:19:42 -0400 Subject: [PATCH 4/8] Remove unnecessary default value --- src/eva/models/versioning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eva/models/versioning.py b/src/eva/models/versioning.py index 24ca9f9b..88908a94 100644 --- a/src/eva/models/versioning.py +++ b/src/eva/models/versioning.py @@ -21,4 +21,4 @@ def hash_prompt_template(template: str) -> str: """Return sha256[:12] of an unrendered prompt template string.""" - return hashlib.sha256(template.encode("utf-8")).hexdigest()[:12] + return hashlib.sha256(template.encode()).hexdigest()[:12] From 
1b3602d7db13349b79c8d95c2c628b23ea5346d6 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 13 May 2026 15:28:44 -0400 Subject: [PATCH 5/8] Move src/eva/models/versioning.py to src/eva/metrics/ --- src/eva/metrics/base.py | 2 +- src/eva/metrics/runner.py | 2 +- src/eva/metrics/signatures.py | 2 +- src/eva/{models => metrics}/versioning.py | 0 src/eva/models/results.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename src/eva/{models => metrics}/versioning.py (100%) diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py index f05563e5..7935f589 100644 --- a/src/eva/metrics/base.py +++ b/src/eva/metrics/base.py @@ -22,9 +22,9 @@ resolve_turn_id, validate_rating, ) +from eva.metrics.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template from eva.models.config import PipelineType from eva.models.results import MetricScore -from eva.models.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template from eva.utils.llm_client import LLMClient from eva.utils.logging import get_logger from eva.utils.prompt_manager import get_prompt_manager diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 32a5b26b..2744db52 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -16,10 +16,10 @@ from eva.metrics.processor import MetricsContextProcessor from eva.metrics.registry import MetricRegistry, get_global_registry from eva.metrics.utils import direction_for_sub_metric +from eva.metrics.versioning import _CURRENT_METRIC_VERSION from eva.models.config import PipelineType, get_pipeline_type from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics -from eva.models.versioning import _CURRENT_METRIC_VERSION from eva.utils.hash_utils import get_dict_hash from eva.utils.logging import get_logger from eva.utils.pass_at_k import ( diff --git a/src/eva/metrics/signatures.py b/src/eva/metrics/signatures.py index 1b5e9e98..f7ca1f36 100644 
--- a/src/eva/metrics/signatures.py +++ b/src/eva/metrics/signatures.py @@ -20,7 +20,7 @@ import eva.metrics.experience # noqa: F401 import eva.metrics.validation # noqa: F401 from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric -from eva.models.versioning import hash_prompt_template +from eva.metrics.versioning import hash_prompt_template from eva.utils.prompt_manager import get_prompt_manager diff --git a/src/eva/models/versioning.py b/src/eva/metrics/versioning.py similarity index 100% rename from src/eva/models/versioning.py rename to src/eva/metrics/versioning.py diff --git a/src/eva/models/results.py b/src/eva/models/results.py index cf9e504e..0558c9dc 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, Field, model_validator -from eva.models.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH +from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH class ErrorDetails(BaseModel): From 0cb0f5e5cb70118a0590ddb773a2ba98f18413b0 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 13 May 2026 15:28:52 -0400 Subject: [PATCH 6/8] Regenerate metric signatures in pre-commit --- .pre-commit-config.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 96b94736..b67a4670 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,19 @@ repos: - id: end-of-file-fixer - repo: local hooks: + - id: regen-metric-signatures + name: Regenerate metric signatures + entry: python3 scripts/regen_metric_signatures.py + language: system + pass_filenames: false + files: |- + (?x: + ^configs/prompts/judge\.yaml$ + | + ^scripts/regen_metric_signatures\.py$ + | + ^src/eva/metrics/ + ) - id: check-version-bump name: Check simulation/metrics version bump entry: python3 scripts/check_version_bump.py From c692de98e27f68025670b56c7a29ab731aa92c6d Mon Sep 17 00:00:00 2001 
From: "joseph.marinier" Date: Wed, 13 May 2026 15:39:05 -0400 Subject: [PATCH 7/8] Avoid circular import --- src/eva/models/results.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/eva/models/results.py b/src/eva/models/results.py index 0558c9dc..0caa7ab6 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -6,8 +6,6 @@ from pydantic import BaseModel, Field, model_validator -from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH - class ErrorDetails(BaseModel): """Detailed error information.""" @@ -114,10 +112,15 @@ class MetricScore(BaseModel): def _auto_stamp_version_and_hash(self) -> "MetricScore": # Only fill if unset, so deserialization from disk preserves historical values # and explicit kwargs (e.g., tests) always win. - if self.version is None: - self.version = _CURRENT_METRIC_VERSION.get() - if self.prompt_hash is None: - self.prompt_hash = _CURRENT_PROMPT_HASH.get() + if self.version is None or self.prompt_hash is None: + # Lazy import to avoid circular dependency: + # eva.models.results -> eva.metrics -> ... 
-> eva.metrics.utils -> eva.models.results + from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH + + if self.version is None: + self.version = _CURRENT_METRIC_VERSION.get() + if self.prompt_hash is None: + self.prompt_hash = _CURRENT_PROMPT_HASH.get() return self From 1d8a8d77f607dae0b1f8c546558e73b67940f44c Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 13 May 2026 15:52:13 -0400 Subject: [PATCH 8/8] Specify `uv run` in `pre-commit` --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b67a4670..2bf5ef82 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: hooks: - id: regen-metric-signatures name: Regenerate metric signatures - entry: python3 scripts/regen_metric_signatures.py + entry: uv run python scripts/regen_metric_signatures.py language: system pass_filenames: false files: |- @@ -30,7 +30,7 @@ repos: ) - id: check-version-bump name: Check simulation/metrics version bump - entry: python3 scripts/check_version_bump.py + entry: uv run python scripts/check_version_bump.py language: system pass_filenames: false always_run: true