From 7d18efe79111e3d4b83a8903ada39b98a6be65da Mon Sep 17 00:00:00 2001
From: Gabrielle Gauthier-Melancon <gabrielle.gm@servicenow.com>
Date: Sun, 3 May 2026 21:48:14 -0400
Subject: [PATCH 1/8] Stamp turn_taking MetricScore outputs with a version
 string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an optional `version` field to MetricScore and wire turn_taking to
populate it from a `version = "v0.1"` class variable at every output site
(main score, missed-turn early return, sub-metrics). This lets us tell,
across partial metric reruns, which computation logic produced a given
row — bump the class var when the algorithm changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/eva/metrics/experience/turn_taking.py | 5 +++++
 src/eva/models/results.py                 | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/src/eva/metrics/experience/turn_taking.py b/src/eva/metrics/experience/turn_taking.py
index 08034584..2c09c877 100644
--- a/src/eva/metrics/experience/turn_taking.py
+++ b/src/eva/metrics/experience/turn_taking.py
@@ -58,6 +58,7 @@ class TurnTakingMetric(CodeMetric):
     description = "Turn-taking evaluation based on per-turn latency and interruption behavior"
     category = "experience"
     pass_at_k_threshold = 0.8
+    version = "v0.1"
 
     # --- Latency curve (piecewise linear). 0 outside [LATENCY_HARD_EARLY_MS, LATENCY_HARD_LATE_MS]. ---
     # Ramp up 0 → 1 from LATENCY_HARD_EARLY_MS to LATENCY_SWEET_SPOT_LOW_MS.
@@ -324,6 +325,7 @@ def _wrap(key: str, value: float, normalized: bool) -> MetricScore:
                 name=f"{cls.name}.{key}",
                 score=value,
                 normalized_score=value if normalized else None,
+                version=cls.version,
             )
 
         # --- Latency ---
@@ -380,6 +382,7 @@ def _pct(p: float) -> float:
             name=f"{cls.name}.agent_interruption.num_interruptions",
             score=float(sum(n_segs_list)) if n_segs_list else None,
             normalized_score=None,
+            version=cls.version,
         )
         if overlap_ms_list:
             sub["agent_interruption.mean_overlap_ms"] = _wrap(
@@ -481,6 +484,7 @@ async def compute(self, context: MetricContext) -> MetricScore:
                     score=0.0,
                     normalized_score=0.0,
                     details=details,
+                    version=self.version,
                 )
 
             score = 0.0 if missed_turn else round(statistics.mean(per_turn_score.values()), 4)
@@ -492,6 +496,7 @@ async def compute(self, context: MetricContext) -> MetricScore:
                 normalized_score=score,
                 details=details,
                 sub_metrics=sub_metrics,
+                version=self.version,
             )
 
         except Exception as e:
diff --git a/src/eva/models/results.py b/src/eva/models/results.py
index 993184db..ebfe9ad3 100644
--- a/src/eva/models/results.py
+++ b/src/eva/models/results.py
@@ -94,6 +94,11 @@ class MetricScore(BaseModel):
         False,
         description="True when the metric had no applicable data to score (distinct from errored)",
     )
+    version: str | None = Field(
+        None,
+        description="Metric implementation version (set by the metric class) for tracking which "
+        "computation logic produced this score across partial reruns",
+    )
     sub_metrics: dict[str, "MetricScore"] | None = Field(
         None, description="Optional sub-metric breakdowns, aggregated generically by the runner"
     )

From 006cc8ce527338f6dba6a4e9c6162a7807e078dd Mon Sep 17 00:00:00 2001
From: Gabrielle Gauthier-Melancon <gabrielle.gm@servicenow.com>
Date: Tue, 12 May 2026 13:17:27 -0400
Subject: [PATCH 2/8] Auto-stamp version + prompt_hash on every MetricScore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generalize the turn_taking-only version stamp to all metrics, and add an
automatic per-judge prompt_hash so prompt edits are detectable even
without a manual version bump. Stamping happens centrally via a Pydantic
model_validator that reads two contextvars: the metric version, set by
MetricsRunner before each metric.compute() call, and the prompt hash,
set by BaseMetric.get_judge_prompt(). Metric authors only declare
`version = "v0.1"` on the class and the rest is automatic at every
MetricScore call site (no per-site `version=self.version` plumbing).

The contextvar approach is per-asyncio-task, so concurrent metrics in
the same record don't bleed values into each other. On partial reruns,
metrics that aren't recomputed keep whatever version/prompt_hash was on
disk — the validator only fills when the field is unset, so deserialized
historical rows are preserved.

prompt_hash is the sha256[:12] of the *unrendered* template (so per-
record variable substitutions don't change the hash). PromptManager
gains `get_template(path)` to expose the raw YAML template; BaseMetric.
get_judge_prompt() pushes the hash into the contextvar each call.

Drift test (tests/unit/metrics/test_metric_signatures.py) compares each
concrete metric class's (version, source_hash, prompt_hash) against
tests/fixtures/metric_signatures.json. Authors run
`python scripts/regen_metric_signatures.py` to refresh the fixture after
a deliberate version bump or prompt edit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/regen_metric_signatures.py            |  30 +++++
 .../metrics/accuracy/agent_speech_fidelity.py |   1 +
 .../accuracy/agent_speech_fidelity_s2s.py     |   1 +
 src/eva/metrics/accuracy/faithfulness.py      |   1 +
 src/eva/metrics/accuracy/task_completion.py   |   1 +
 src/eva/metrics/base.py                       |  11 +-
 .../diagnostic/authentication_success.py      |   1 +
 .../conversation_correctly_finished.py        |   1 +
 src/eva/metrics/diagnostic/response_speed.py  |   1 +
 src/eva/metrics/diagnostic/speakability.py    |   1 +
 src/eva/metrics/diagnostic/stt_wer.py         |   1 +
 .../metrics/diagnostic/tool_call_validity.py  |   1 +
 .../transcription_accuracy_key_entities.py    |   1 +
 src/eva/metrics/experience/conciseness.py     |   1 +
 .../experience/conversation_progression.py    |   1 +
 src/eva/metrics/experience/turn_taking.py     |   4 -
 src/eva/metrics/runner.py                     |   4 +
 src/eva/metrics/signatures.py                 |  78 +++++++++++++
 .../validation/conversation_valid_end.py      |   1 +
 .../validation/user_behavioral_fidelity.py    |   1 +
 .../validation/user_speech_fidelity.py        |   1 +
 src/eva/models/results.py                     |  19 +++-
 src/eva/models/versioning.py                  |  24 ++++
 src/eva/utils/prompt_manager.py               |  18 +++
 tests/fixtures/metric_signatures.json         | 104 ++++++++++++++++++
 tests/unit/metrics/test_metric_signatures.py  |  83 ++++++++++++++
 tests/unit/metrics/test_runner.py             |  90 +++++++++++++++
 27 files changed, 475 insertions(+), 6 deletions(-)
 create mode 100644 scripts/regen_metric_signatures.py
 create mode 100644 src/eva/metrics/signatures.py
 create mode 100644 src/eva/models/versioning.py
 create mode 100644 tests/fixtures/metric_signatures.json
 create mode 100644 tests/unit/metrics/test_metric_signatures.py

diff --git a/scripts/regen_metric_signatures.py b/scripts/regen_metric_signatures.py
new file mode 100644
index 00000000..9c27e69e
--- /dev/null
+++ b/scripts/regen_metric_signatures.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Regenerate tests/fixtures/metric_signatures.json.
+
+Run this after intentionally changing a metric's logic and bumping its
+`version` class attribute (or after editing its judge prompt template).
+The drift test (tests/unit/metrics/test_metric_signatures.py) compares
+the current state against this fixture and fails on any unintended drift.
+
+Usage:
+    python scripts/regen_metric_signatures.py
+"""
+
+import json
+from pathlib import Path
+
+from eva.metrics.signatures import compute_all_metric_signatures
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+FIXTURE_PATH = REPO_ROOT / "tests" / "fixtures" / "metric_signatures.json"
+
+
+def main() -> None:
+    signatures = compute_all_metric_signatures()
+    FIXTURE_PATH.parent.mkdir(parents=True, exist_ok=True)
+    FIXTURE_PATH.write_text(json.dumps(signatures, indent=2, sort_keys=True) + "\n")
+    print(f"Wrote {len(signatures)} metric signatures to {FIXTURE_PATH.relative_to(REPO_ROOT)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity.py b/src/eva/metrics/accuracy/agent_speech_fidelity.py
index a659f7c4..d84179f7 100644
--- a/src/eva/metrics/accuracy/agent_speech_fidelity.py
+++ b/src/eva/metrics/accuracy/agent_speech_fidelity.py
@@ -14,6 +14,7 @@ class AgentSpeechFidelityMetric(SpeechFidelityBaseMetric):
     """
 
     name = "agent_speech_fidelity"
+    version = "v0.1"
     description = "Audio-based evaluation of agent speech fidelity to the intended text"
     category = "accuracy"
     role = "assistant"
diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
index c6d43bb6..2a9706cf 100644
--- a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
+++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
@@ -25,6 +25,7 @@ class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric):
     """
 
     name = "agent_speech_fidelity"
+    version = "v0.1"
     description = "Audio-based evaluation of agent entity fidelity for S2S models"
     category = "accuracy"
     role = "assistant"
diff --git a/src/eva/metrics/accuracy/faithfulness.py b/src/eva/metrics/accuracy/faithfulness.py
index 3a85d29e..e6743792 100644
--- a/src/eva/metrics/accuracy/faithfulness.py
+++ b/src/eva/metrics/accuracy/faithfulness.py
@@ -54,6 +54,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric):
     """
 
     name = "faithfulness"
+    version = "v0.1"
     description = (
         "LLM judge evaluation of whether the assistant remains faithful to information, policies, and instructions"
     )
diff --git a/src/eva/metrics/accuracy/task_completion.py b/src/eva/metrics/accuracy/task_completion.py
index f1cbe8d0..2ca780b8 100644
--- a/src/eva/metrics/accuracy/task_completion.py
+++ b/src/eva/metrics/accuracy/task_completion.py
@@ -35,6 +35,7 @@ class TaskCompletion(BaseMetric):
     """
 
     name = "task_completion"
+    version = "v0.1"
     description = "Binary task completion via scenario DB state hash comparison"
     category = "accuracy"
     metric_type = MetricType.CODE
diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py
index 45a4a501..f05563e5 100644
--- a/src/eva/metrics/base.py
+++ b/src/eva/metrics/base.py
@@ -24,6 +24,7 @@
 )
 from eva.models.config import PipelineType
 from eva.models.results import MetricScore
+from eva.models.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
 from eva.utils.llm_client import LLMClient
 from eva.utils.logging import get_logger
 from eva.utils.prompt_manager import get_prompt_manager
@@ -163,6 +164,9 @@ class BaseMetric(ABC):
     pass_at_k_threshold: float = 0.5  # Normalized score threshold for pass@k pass/fail
     exclude_from_pass_at_k: bool = False  # Set True for metrics not suitable for pass@k
     supported_pipeline_types: frozenset[PipelineType] = frozenset(PipelineType)  # Pipeline types this metric supports
+    # Bump on intentional logic changes; stamped (via a contextvar) onto every MetricScore built in
+    # compute(). Set on every concrete subclass: the drift-signature walk skips classes left at None.
+    version: str | None = None
     # Direction of the displayed value (normalized_score if present, else score).
     # Override to False for lower-is-better parent metrics (e.g. latency). Sub-metric
     # direction is derived from the key suffix (see eva.metrics.utils.direction_for_sub_metric).
@@ -179,8 +183,13 @@ def __init__(self, config: dict[str, Any] | None = None):
         self.prompt_manager = get_prompt_manager()
 
     def get_judge_prompt(self, prompt_key: str = "user_prompt", **variables) -> str:
-        """Get judge prompt using PromptManager."""
+        """Get judge prompt using PromptManager.
+
+        Stamps the unrendered template's sha256[:12] into the prompt-hash contextvar so
+        any MetricScore built afterwards in the same compute() picks it up automatically.
+        """
         prompt_path = f"judge.{self.name}.{prompt_key}"
+        _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path)))
         return self.prompt_manager.get_prompt(prompt_path, **variables)
 
     @abstractmethod
diff --git a/src/eva/metrics/diagnostic/authentication_success.py b/src/eva/metrics/diagnostic/authentication_success.py
index 80f93905..0662231b 100644
--- a/src/eva/metrics/diagnostic/authentication_success.py
+++ b/src/eva/metrics/diagnostic/authentication_success.py
@@ -45,6 +45,7 @@ class AuthenticationSuccessMetric(CodeMetric):
     """
 
     name = "authentication_success"
+    version = "v0.1"
     description = "Checks if session state in final DB is a superset of expected session"
     category = "diagnostic"
     exclude_from_pass_at_k = True
diff --git a/src/eva/metrics/diagnostic/conversation_correctly_finished.py b/src/eva/metrics/diagnostic/conversation_correctly_finished.py
index 6ce5cc49..c38adb7d 100644
--- a/src/eva/metrics/diagnostic/conversation_correctly_finished.py
+++ b/src/eva/metrics/diagnostic/conversation_correctly_finished.py
@@ -11,6 +11,7 @@ class ConversationCorrectlyFinishedMetric(CodeMetric):
     """0.0 when the agent timed out on the user's final turn; 1.0 otherwise."""
 
     name = "conversation_correctly_finished"
+    version = "v0.1"
     description = "Diagnostic metric: 0.0 when agent failed to respond to the user's final turn"
     category = "diagnostic"
     exclude_from_pass_at_k = True
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index a7724b07..56571e2c 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -60,6 +60,7 @@ class ResponseSpeedMetric(CodeMetric):
     description = "Diagnostic metric: latency between user utterance end and assistant response start"
     exclude_from_pass_at_k = True
     higher_is_better = False  # Score is latency in seconds — lower is better.
+    version = "v0.1"
 
     async def compute(self, context: MetricContext) -> MetricScore:
         try:
diff --git a/src/eva/metrics/diagnostic/speakability.py b/src/eva/metrics/diagnostic/speakability.py
index bc9d8a45..f3e9c3b4 100644
--- a/src/eva/metrics/diagnostic/speakability.py
+++ b/src/eva/metrics/diagnostic/speakability.py
@@ -30,6 +30,7 @@ class SpeakabilityJudgeMetric(PerTurnConversationJudgeMetric):
     """
 
     name = "speakability"
+    version = "v0.1"
     description = "Debug metric: LLM judge evaluation of text voice-friendliness per turn"
     category = "diagnostic"
     exclude_from_pass_at_k = True
diff --git a/src/eva/metrics/diagnostic/stt_wer.py b/src/eva/metrics/diagnostic/stt_wer.py
index 52ad5c2b..5c52e50c 100644
--- a/src/eva/metrics/diagnostic/stt_wer.py
+++ b/src/eva/metrics/diagnostic/stt_wer.py
@@ -72,6 +72,7 @@ class STTWERMetric(CodeMetric):
     """
 
     name = "stt_wer"
+    version = "v0.1"
     description = "Debug metric: Speech-to-Text transcription accuracy using Word Error Rate"
     category = "diagnostic"
     exclude_from_pass_at_k = True
diff --git a/src/eva/metrics/diagnostic/tool_call_validity.py b/src/eva/metrics/diagnostic/tool_call_validity.py
index 12b4f7d8..01cfde95 100644
--- a/src/eva/metrics/diagnostic/tool_call_validity.py
+++ b/src/eva/metrics/diagnostic/tool_call_validity.py
@@ -39,6 +39,7 @@ class ToolCallValidity(CodeMetric):
     """
 
     name = "tool_call_validity"
+    version = "v0.1"
     description = "Debug metric: fraction of tool calls with correctly formatted parameters"
     category = "diagnostic"
     exclude_from_pass_at_k = True
diff --git a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py
index ecdb79bf..4990b579 100644
--- a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py
+++ b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py
@@ -48,6 +48,7 @@ class TranscriptionAccuracyKeyEntitiesMetric(TextJudgeMetric):
     """
 
     name = "transcription_accuracy_key_entities"
+    version = "v0.1"
     description = "Debug metric: LLM judge evaluation of STT key entity transcription accuracy for entire conversation"
     category = "diagnostic"
     exclude_from_pass_at_k = True
diff --git a/src/eva/metrics/experience/conciseness.py b/src/eva/metrics/experience/conciseness.py
index 3f77890e..2b4bc196 100644
--- a/src/eva/metrics/experience/conciseness.py
+++ b/src/eva/metrics/experience/conciseness.py
@@ -28,6 +28,7 @@ class ConcisenessJudgeMetric(PerTurnConversationJudgeMetric):
     """
 
     name = "conciseness"
+    version = "v0.1"
     description = "LLM judge evaluation of assistant response conciseness"
     category = "experience"
     rating_scale = (1, 3)
diff --git a/src/eva/metrics/experience/conversation_progression.py b/src/eva/metrics/experience/conversation_progression.py
index 595101e4..f5abfa80 100644
--- a/src/eva/metrics/experience/conversation_progression.py
+++ b/src/eva/metrics/experience/conversation_progression.py
@@ -32,6 +32,7 @@ class ConversationProgressionJudgeMetric(ConversationTextJudgeMetric):
     """
 
     name = "conversation_progression"
+    version = "v0.1"
     description = "LLM judge evaluation of whether the assistant moved the conversation forward productively"
     category = "experience"
     rating_scale = (1, 3)
diff --git a/src/eva/metrics/experience/turn_taking.py b/src/eva/metrics/experience/turn_taking.py
index 2c09c877..7e3da258 100644
--- a/src/eva/metrics/experience/turn_taking.py
+++ b/src/eva/metrics/experience/turn_taking.py
@@ -325,7 +325,6 @@ def _wrap(key: str, value: float, normalized: bool) -> MetricScore:
                 name=f"{cls.name}.{key}",
                 score=value,
                 normalized_score=value if normalized else None,
-                version=cls.version,
             )
 
         # --- Latency ---
@@ -382,7 +381,6 @@ def _pct(p: float) -> float:
             name=f"{cls.name}.agent_interruption.num_interruptions",
             score=float(sum(n_segs_list)) if n_segs_list else None,
             normalized_score=None,
-            version=cls.version,
         )
         if overlap_ms_list:
             sub["agent_interruption.mean_overlap_ms"] = _wrap(
@@ -484,7 +482,6 @@ async def compute(self, context: MetricContext) -> MetricScore:
                     score=0.0,
                     normalized_score=0.0,
                     details=details,
-                    version=self.version,
                 )
 
             score = 0.0 if missed_turn else round(statistics.mean(per_turn_score.values()), 4)
@@ -496,7 +493,6 @@ async def compute(self, context: MetricContext) -> MetricScore:
                 normalized_score=score,
                 details=details,
                 sub_metrics=sub_metrics,
-                version=self.version,
             )
 
         except Exception as e:
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index f7d945c5..32a5b26b 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -19,6 +19,7 @@
 from eva.models.config import PipelineType, get_pipeline_type
 from eva.models.record import EvaluationRecord
 from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
+from eva.models.versioning import _CURRENT_METRIC_VERSION
 from eva.utils.hash_utils import get_dict_hash
 from eva.utils.logging import get_logger
 from eva.utils.pass_at_k import (
@@ -448,6 +449,9 @@ async def _run_record(self, record_id: str, record_dir: Path) -> RecordMetrics:
         # Create tasks for all metrics
         async def compute_metric(metric: BaseMetric) -> tuple[str, MetricScore]:
             """Compute a single metric and handle errors."""
+            # Each gather() task gets its own contextvar snapshot, so this set is
+            # isolated from sibling/parent tasks — no reset needed.
+            _CURRENT_METRIC_VERSION.set(metric.version)
             try:
                 logger.info(f"[{record_id}] Starting metric: {metric.name}")
                 score = await metric.compute(context)
diff --git a/src/eva/metrics/signatures.py b/src/eva/metrics/signatures.py
new file mode 100644
index 00000000..1b5e9e98
--- /dev/null
+++ b/src/eva/metrics/signatures.py
@@ -0,0 +1,78 @@
+"""Compute drift signatures for metric classes.
+
+A metric's "signature" captures everything we want to detect changes to:
+ - `version`: the manually-bumped string on the class
+ - `source_hash`: sha256[:12] of `inspect.getsource(cls)` (class body)
+ - `prompt_hash`: sha256[:12] of the unrendered judge prompt template, or
+                  None for non-judge metrics
+
+The drift test compares the current signatures against a checked-in fixture
+and fails if anything changed without an explicit version bump + fixture regen.
+"""
+
+import hashlib
+import inspect
+
+# Importing the metric subpackages forces all concrete metric classes to be
+# registered as BaseMetric subclasses, so walking __subclasses__ finds them.
+import eva.metrics.accuracy  # noqa: F401
+import eva.metrics.diagnostic  # noqa: F401
+import eva.metrics.experience  # noqa: F401
+import eva.metrics.validation  # noqa: F401
+from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric
+from eva.models.versioning import hash_prompt_template
+from eva.utils.prompt_manager import get_prompt_manager
+
+
+def _all_concrete_versioned_metric_classes() -> dict[str, type[BaseMetric]]:
+    """Walk BaseMetric subclasses; return concrete classes that set a version.
+
+    Keyed on class qualname so co-named classes (e.g., the cascade vs S2S
+    variants of `agent_speech_fidelity`) get distinct entries.
+    """
+    result: dict[str, type[BaseMetric]] = {}
+
+    def walk(cls: type) -> None:
+        for sub in cls.__subclasses__():
+            walk(sub)
+            if inspect.isabstract(sub):
+                continue
+            # `version` is None on BaseMetric; only concrete classes that
+            # deliberately set it are participating.
+            if getattr(sub, "version", None) is None:
+                continue
+            result[sub.__qualname__] = sub
+
+    walk(BaseMetric)
+    return result
+
+
+def _source_hash(cls: type) -> str:
+    """sha256[:12] of the class body source."""
+    return hashlib.sha256(inspect.getsource(cls).encode("utf-8")).hexdigest()[:12]
+
+
+def _prompt_hash_for_metric(cls: type[BaseMetric]) -> str | None:
+    """Return the prompt template hash for judge metrics, or None.
+
+    All judge metrics in this codebase use `judge.{name}.user_prompt`.
+    A judge metric without a corresponding template raises KeyError —
+    that's a configuration bug we want surfaced.
+    """
+    if not issubclass(cls, TextJudgeMetric | AudioJudgeMetric):
+        return None
+    template = get_prompt_manager().get_template(f"judge.{cls.name}.user_prompt")
+    return hash_prompt_template(template)
+
+
+def compute_all_metric_signatures() -> dict[str, dict[str, str | None]]:
+    """Return {class_qualname: {name, version, source_hash, prompt_hash}} for every concrete versioned metric."""
+    out: dict[str, dict[str, str | None]] = {}
+    for qualname, cls in _all_concrete_versioned_metric_classes().items():
+        out[qualname] = {
+            "name": cls.name,
+            "version": cls.version,
+            "source_hash": _source_hash(cls),
+            "prompt_hash": _prompt_hash_for_metric(cls),
+        }
+    return out
diff --git a/src/eva/metrics/validation/conversation_valid_end.py b/src/eva/metrics/validation/conversation_valid_end.py
index 61831f6f..f6a6ebc5 100644
--- a/src/eva/metrics/validation/conversation_valid_end.py
+++ b/src/eva/metrics/validation/conversation_valid_end.py
@@ -14,6 +14,7 @@ class ConversationValidEndMetric(CodeMetric):
     """Binary score: 1.0 when the conversation ended on goodbye OR agent-timeout-on-user-turn; 0.0 otherwise."""
 
     name = "conversation_valid_end"
+    version = "v0.1"
     description = "Validation metric: conversation reached a definitive end state"
     category = "validation"
 
diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py
index 56bd4da4..0af13816 100644
--- a/src/eva/metrics/validation/user_behavioral_fidelity.py
+++ b/src/eva/metrics/validation/user_behavioral_fidelity.py
@@ -65,6 +65,7 @@ class UserBehavioralFidelityMetric(ConversationTextJudgeMetric):
     """
 
     name = "user_behavioral_fidelity"
+    version = "v0.1"
     description = "Validation metric for simulated user corruption detection"
     category = "validation"
     rating_scale = (0, 1)
diff --git a/src/eva/metrics/validation/user_speech_fidelity.py b/src/eva/metrics/validation/user_speech_fidelity.py
index 3f605d32..a11fc8a6 100644
--- a/src/eva/metrics/validation/user_speech_fidelity.py
+++ b/src/eva/metrics/validation/user_speech_fidelity.py
@@ -14,6 +14,7 @@ class UserSpeechFidelityMetric(SpeechFidelityBaseMetric):
     """
 
     name = "user_speech_fidelity"
+    version = "v0.1"
     description = "Audio-based validation of user speech fidelity to the intended text"
     category = "validation"
     role = "user"
diff --git a/src/eva/models/results.py b/src/eva/models/results.py
index ebfe9ad3..cf9e504e 100644
--- a/src/eva/models/results.py
+++ b/src/eva/models/results.py
@@ -4,7 +4,9 @@
 from datetime import datetime
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
+
+from eva.models.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
 
 
 class ErrorDetails(BaseModel):
@@ -99,10 +101,25 @@ class MetricScore(BaseModel):
         description="Metric implementation version (set by the metric class) for tracking which "
         "computation logic produced this score across partial reruns",
     )
+    prompt_hash: str | None = Field(
+        None,
+        description="sha256[:12] of the unrendered judge prompt template; None for non-judge metrics. "
+        "Lets us detect prompt edits without relying on the metric author to bump `version`.",
+    )
     sub_metrics: dict[str, "MetricScore"] | None = Field(
         None, description="Optional sub-metric breakdowns, aggregated generically by the runner"
     )
 
+    @model_validator(mode="after")
+    def _auto_stamp_version_and_hash(self) -> "MetricScore":
+        # Only fill if unset, so deserialization from disk preserves historical values
+        # and explicit kwargs (e.g., tests) always win.
+        if self.version is None:
+            self.version = _CURRENT_METRIC_VERSION.get()
+        if self.prompt_hash is None:
+            self.prompt_hash = _CURRENT_PROMPT_HASH.get()
+        return self
+
 
 class PassAtKResult(BaseModel):
     """pass@k and pass^k result for a single metric across multiple trials."""
diff --git a/src/eva/models/versioning.py b/src/eva/models/versioning.py
new file mode 100644
index 00000000..24ca9f9b
--- /dev/null
+++ b/src/eva/models/versioning.py
@@ -0,0 +1,24 @@
+"""Per-record version + prompt-hash stamping for MetricScore.
+
+MetricsRunner sets the version contextvar before every metric.compute()
+call; BaseMetric.get_judge_prompt() sets the prompt-hash contextvar. The
+MetricScore model_validator reads both and auto-fills version/prompt_hash
+when unset, so all scores and sub-scores built inside that compute()
+inherit the right values without each call site threading them through.
+
+Both contextvars default to None, which means "not currently inside a
+metric compute() call" — that's the state during JSON deserialization
+(loading metrics.json from disk), so existing on-disk values are
+preserved instead of being overwritten with None.
+"""
+
+import hashlib
+from contextvars import ContextVar
+
+_CURRENT_METRIC_VERSION: ContextVar[str | None] = ContextVar("current_metric_version", default=None)
+_CURRENT_PROMPT_HASH: ContextVar[str | None] = ContextVar("current_prompt_hash", default=None)
+
+
+def hash_prompt_template(template: str) -> str:
+    """Return sha256[:12] of an unrendered prompt template string."""
+    return hashlib.sha256(template.encode("utf-8")).hexdigest()[:12]
diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py
index c67db941..a4f30d21 100644
--- a/src/eva/utils/prompt_manager.py
+++ b/src/eva/utils/prompt_manager.py
@@ -81,6 +81,24 @@ def _load_from_directory(self, directory: Path) -> None:
         for yaml_file in yaml_files:
             self._load_single_file(yaml_file)
 
+    def get_template(self, path: str) -> str:
+        """Return the unrendered prompt template at `path` (no variable substitution).
+
+        Used for hashing prompt templates so we can detect prompt edits across
+        runs without the per-record variable substitutions changing the hash.
+        """
+        parts = path.split(".")
+        value = self.prompts
+        for part in parts:
+            if not isinstance(value, dict):
+                raise KeyError(f"Invalid prompt path: {path} (stopped at {part})")
+            if part not in value:
+                raise KeyError(f"Prompt not found: {path} (missing key: {part})")
+            value = value[part]
+        if not isinstance(value, str):
+            raise ValueError(f"Prompt at {path} is not a string: {type(value)}")
+        return value
+
     def get_prompt(self, path: str, **variables) -> str:
         """Get a prompt by its path and substitute variables.
 
diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json
new file mode 100644
index 00000000..fc79f392
--- /dev/null
+++ b/tests/fixtures/metric_signatures.json
@@ -0,0 +1,104 @@
+{
+  "AgentSpeechFidelityMetric": {
+    "name": "agent_speech_fidelity",
+    "prompt_hash": "864be78919d2",
+    "source_hash": "77743114e9b0",
+    "version": "v0.1"
+  },
+  "AgentSpeechFidelityS2SMetric": {
+    "name": "agent_speech_fidelity",
+    "prompt_hash": "864be78919d2",
+    "source_hash": "5b3deb4968cd",
+    "version": "v0.1"
+  },
+  "AuthenticationSuccessMetric": {
+    "name": "authentication_success",
+    "prompt_hash": null,
+    "source_hash": "cdc7c59d2684",
+    "version": "v0.1"
+  },
+  "ConcisenessJudgeMetric": {
+    "name": "conciseness",
+    "prompt_hash": "5d033338d36a",
+    "source_hash": "cd0ea09a9613",
+    "version": "v0.1"
+  },
+  "ConversationCorrectlyFinishedMetric": {
+    "name": "conversation_correctly_finished",
+    "prompt_hash": null,
+    "source_hash": "4f27cacab7d2",
+    "version": "v0.1"
+  },
+  "ConversationProgressionJudgeMetric": {
+    "name": "conversation_progression",
+    "prompt_hash": "f3240185faf6",
+    "source_hash": "91b71c803d77",
+    "version": "v0.1"
+  },
+  "ConversationValidEndMetric": {
+    "name": "conversation_valid_end",
+    "prompt_hash": null,
+    "source_hash": "02d1c3fb330b",
+    "version": "v0.1"
+  },
+  "FaithfulnessJudgeMetric": {
+    "name": "faithfulness",
+    "prompt_hash": "1add7d47362c",
+    "source_hash": "8e3fd6bc9960",
+    "version": "v0.1"
+  },
+  "ResponseSpeedMetric": {
+    "name": "response_speed",
+    "prompt_hash": null,
+    "source_hash": "ebce1a06bf30",
+    "version": "v0.1"
+  },
+  "STTWERMetric": {
+    "name": "stt_wer",
+    "prompt_hash": null,
+    "source_hash": "01fcfbc1cf21",
+    "version": "v0.1"
+  },
+  "SpeakabilityJudgeMetric": {
+    "name": "speakability",
+    "prompt_hash": "cd2cc44fc96c",
+    "source_hash": "187ddf9dc0da",
+    "version": "v0.1"
+  },
+  "TaskCompletion": {
+    "name": "task_completion",
+    "prompt_hash": null,
+    "source_hash": "01aed1a552f4",
+    "version": "v0.1"
+  },
+  "ToolCallValidity": {
+    "name": "tool_call_validity",
+    "prompt_hash": null,
+    "source_hash": "1572b16592fe",
+    "version": "v0.1"
+  },
+  "TranscriptionAccuracyKeyEntitiesMetric": {
+    "name": "transcription_accuracy_key_entities",
+    "prompt_hash": "c0980ff2168d",
+    "source_hash": "a83a699d0fda",
+    "version": "v0.1"
+  },
+  "TurnTakingMetric": {
+    "name": "turn_taking",
+    "prompt_hash": null,
+    "source_hash": "aa574674ff83",
+    "version": "v0.1"
+  },
+  "UserBehavioralFidelityMetric": {
+    "name": "user_behavioral_fidelity",
+    "prompt_hash": "06477144c28e",
+    "source_hash": "af8144bd7731",
+    "version": "v0.1"
+  },
+  "UserSpeechFidelityMetric": {
+    "name": "user_speech_fidelity",
+    "prompt_hash": "c4d97e36b865",
+    "source_hash": "e38e8c162b3d",
+    "version": "v0.1"
+  }
+}
diff --git a/tests/unit/metrics/test_metric_signatures.py b/tests/unit/metrics/test_metric_signatures.py
new file mode 100644
index 00000000..24ce61c5
--- /dev/null
+++ b/tests/unit/metrics/test_metric_signatures.py
@@ -0,0 +1,83 @@
+"""Drift test: fail when a metric's source or prompt changes without a version bump.
+
+Each concrete metric class has three signature fields:
+  - version: manually bumped string on the class
+  - source_hash: sha256[:12] of inspect.getsource(cls)
+  - prompt_hash: sha256[:12] of judge.{name}.user_prompt template (None for code metrics)
+
+The fixture at tests/fixtures/metric_signatures.json is the source of truth for
+the *currently-released* state. The test compares the current signatures to it
+and reports drift. Authors update the fixture by running
+`python scripts/regen_metric_signatures.py` after bumping `version`.
+
+Failure modes the test catches:
+  - source_hash changed, version unchanged → "bump version then regen fixture"
+  - prompt_hash changed, version unchanged → "bump version then regen fixture"
+  - version bumped, fixture not regenerated → "regen fixture" (reported even if source/prompt are unchanged)
+  - new metric class with no fixture entry → "add to fixture via regen"
+  - metric removed from code but still in fixture → "delete from fixture via regen"
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from eva.metrics.signatures import compute_all_metric_signatures
+
+FIXTURE_PATH = Path(__file__).resolve().parents[3] / "tests" / "fixtures" / "metric_signatures.json"
+REGEN_HINT = "Run `python scripts/regen_metric_signatures.py` after bumping `version` on affected classes."
+
+
+@pytest.fixture(scope="module")
+def fixture_signatures() -> dict[str, dict[str, str | None]]:
+    return json.loads(FIXTURE_PATH.read_text())
+
+
+@pytest.fixture(scope="module")
+def current_signatures() -> dict[str, dict[str, str | None]]:
+    return compute_all_metric_signatures()
+
+
+def test_no_unannounced_metric_drift(
+    fixture_signatures: dict[str, dict[str, str | None]],
+    current_signatures: dict[str, dict[str, str | None]],
+) -> None:
+    """Fail if any metric's source/prompt changed without its version being bumped."""
+    failures: list[str] = []
+
+    for qualname, current in current_signatures.items():
+        recorded = fixture_signatures.get(qualname)
+        if recorded is None:
+            failures.append(f"{qualname}: new metric class not in fixture. {REGEN_HINT}")
+            continue
+
+        version_changed = current["version"] != recorded["version"]
+        source_changed = current["source_hash"] != recorded["source_hash"]
+        prompt_changed = current["prompt_hash"] != recorded["prompt_hash"]
+
+        if not (source_changed or prompt_changed or version_changed):
+            continue  # fully in sync
+
+        if version_changed:
+            # Author bumped version; they still need to regen the fixture so
+            # future drift is detected against the new baseline.
+            failures.append(f"{qualname}: version bumped ({recorded['version']} → {current['version']}). {REGEN_HINT}")
+            continue
+
+        # Code or prompt changed but version is unchanged — the case the test
+        # exists to catch.
+        what = []
+        if source_changed:
+            what.append(f"source ({recorded['source_hash']} → {current['source_hash']})")
+        if prompt_changed:
+            what.append(f"prompt ({recorded['prompt_hash']} → {current['prompt_hash']})")
+        failures.append(
+            f"{qualname}: {' and '.join(what)} changed but version still {current['version']!r}. "
+            f"Bump `version` on the class, then run regen."
+        )
+
+    for qualname in fixture_signatures.keys() - current_signatures.keys():
+        failures.append(f"{qualname}: removed from code but still in fixture. {REGEN_HINT}")
+
+    assert not failures, "Metric signature drift detected:\n  " + "\n  ".join(failures)
diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py
index d1d587ad..fec32d0d 100644
--- a/tests/unit/metrics/test_runner.py
+++ b/tests/unit/metrics/test_runner.py
@@ -615,6 +615,96 @@ async def test_rerun_partial_success(self, tmp_path):
         assert result.metrics["m_a"].error is None
         assert result.metrics["m_b"].error == "still failing"
 
+    @pytest.mark.asyncio
+    async def test_rerun_preserves_version_of_non_rerun_metrics(self, tmp_path):
+        """Non-rerun metrics keep their on-disk version; only the rerun metric gets the new version.
+
+        Models the workflow where a user bumps `m_b`'s class version (e.g., to v0.2) and
+        then reruns only the errored records for m_b. m_a wasn't touched, so its on-disk
+        version must survive untouched.
+        """
+        run_dir = _setup_run_dir(tmp_path, ["rec-0"])
+        records = [_make_record("rec-0")]
+        record_dir = run_dir / "records" / "rec-0"
+
+        # On disk: m_a succeeded at v0.1, m_b failed at v0.1 (and we want to rerun m_b)
+        _write_metrics_json(
+            record_dir,
+            "rec-0",
+            {
+                "m_a": MetricScore(name="m_a", score=0.9, normalized_score=0.9, version="v0.1"),
+                "m_b": MetricScore(name="m_b", score=0.0, normalized_score=0.0, error="fail-b", version="v0.1"),
+            },
+        )
+
+        runner = _make_runner(
+            run_dir,
+            records,
+            ["m_a", "m_b"],
+            record_metric_filter={"rec-0": {"m_b"}},
+        )
+        # Simulate that m_b's class version was bumped to v0.2 between runs
+        _install_mock(
+            runner,
+            {
+                "rec-0": {
+                    "m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2"),
+                },
+            },
+        )
+
+        result = await runner.run_and_save_record("rec-0", record_dir)
+
+        # In-memory result preserves on-disk version for m_a, stamps new version on m_b
+        assert result.metrics["m_a"].version == "v0.1", "m_a version must not change on partial rerun"
+        assert result.metrics["m_b"].version == "v0.2", "m_b should be re-stamped with the new version"
+
+        # Persisted to disk identically
+        on_disk = json.loads((record_dir / "metrics.json").read_text())["metrics"]
+        assert on_disk["m_a"]["version"] == "v0.1"
+        assert on_disk["m_b"]["version"] == "v0.2"
+
+    @pytest.mark.asyncio
+    async def test_rerun_preserves_legacy_unversioned_metrics(self, tmp_path):
+        """Pre-versioning rows (no `version` on disk) stay `version=None` after a partial rerun."""
+        run_dir = _setup_run_dir(tmp_path, ["rec-0"])
+        records = [_make_record("rec-0")]
+        record_dir = run_dir / "records" / "rec-0"
+
+        # Write a metrics.json by hand without the version field (pre-versioning format)
+        legacy_blob = {
+            "record_id": "rec-0",
+            "metrics": {
+                "m_a": {"name": "m_a", "score": 0.5, "normalized_score": 0.5, "details": {}},
+                "m_b": {
+                    "name": "m_b",
+                    "score": 0.0,
+                    "normalized_score": 0.0,
+                    "error": "fail-b",
+                    "details": {},
+                },
+            },
+        }
+        (record_dir / "metrics.json").write_text(json.dumps(legacy_blob))
+
+        runner = _make_runner(
+            run_dir,
+            records,
+            ["m_a", "m_b"],
+            record_metric_filter={"rec-0": {"m_b"}},
+        )
+        _install_mock(
+            runner,
+            {"rec-0": {"m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2")}},
+        )
+
+        result = await runner.run_and_save_record("rec-0", record_dir)
+
+        # m_a was not rerun, so its legacy unversioned state must survive
+        assert result.metrics["m_a"].version is None
+        # m_b was rerun with the bumped class version
+        assert result.metrics["m_b"].version == "v0.2"
+
     @pytest.mark.asyncio
     async def test_same_metric_fails_on_two_records(self, tmp_path):
         """The same metric failing on two different records is rerun independently on each."""

From 6d66128b4a6801469099d59c9bf5c5a0794bb3f7 Mon Sep 17 00:00:00 2001
From: "joseph.marinier" <joseph.marinier@servicenow.com>
Date: Wed, 13 May 2026 14:18:36 -0400
Subject: [PATCH 3/8] Remove duplicated code in `get_prompt()`

---
 src/eva/utils/prompt_manager.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py
index a4f30d21..1060c41e 100644
--- a/src/eva/utils/prompt_manager.py
+++ b/src/eva/utils/prompt_manager.py
@@ -84,19 +84,26 @@ def _load_from_directory(self, directory: Path) -> None:
     def get_template(self, path: str) -> str:
         """Return the unrendered prompt template at `path` (no variable substitution).
 
-        Used for hashing prompt templates so we can detect prompt edits across
+        Args:
+            path: Dot-separated path to the prompt (e.g., "orchestrator.system_prompt")
+
+        Used as is for hashing prompt templates so we can detect prompt edits across
         runs without the per-record variable substitutions changing the hash.
         """
+        # Navigate to the prompt using the dot-separated path
         parts = path.split(".")
         value = self.prompts
+
         for part in parts:
             if not isinstance(value, dict):
                 raise KeyError(f"Invalid prompt path: {path} (stopped at {part})")
             if part not in value:
                 raise KeyError(f"Prompt not found: {path} (missing key: {part})")
             value = value[part]
+
         if not isinstance(value, str):
             raise ValueError(f"Prompt at {path} is not a string: {type(value)}")
+
         return value
 
     def get_prompt(self, path: str, **variables) -> str:
@@ -113,19 +120,7 @@ def get_prompt(self, path: str, **variables) -> str:
             KeyError: If the prompt path is not found
             ValueError: If the prompt is not a string
         """
-        # Navigate to the prompt using the dot-separated path
-        parts = path.split(".")
-        value = self.prompts
-
-        for part in parts:
-            if not isinstance(value, dict):
-                raise KeyError(f"Invalid prompt path: {path} (stopped at {part})")
-            if part not in value:
-                raise KeyError(f"Prompt not found: {path} (missing key: {part})")
-            value = value[part]
-
-        if not isinstance(value, str):
-            raise ValueError(f"Prompt at {path} is not a string: {type(value)}")
+        value = self.get_template(path)
 
         # Substitute variables using str.format()
         # Auto-inject global variables from the _shared section (prompt-level vars take precedence)

From bd30a7c3d31376c179b1e60f913fbb17845af04e Mon Sep 17 00:00:00 2001
From: Joseph Marinier <Joseph.Marinier@gmail.com>
Date: Wed, 13 May 2026 14:19:42 -0400
Subject: [PATCH 4/8] Remove unnecessary default value

---
 src/eva/models/versioning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eva/models/versioning.py b/src/eva/models/versioning.py
index 24ca9f9b..88908a94 100644
--- a/src/eva/models/versioning.py
+++ b/src/eva/models/versioning.py
@@ -21,4 +21,4 @@
 
 def hash_prompt_template(template: str) -> str:
     """Return sha256[:12] of an unrendered prompt template string."""
-    return hashlib.sha256(template.encode("utf-8")).hexdigest()[:12]
+    return hashlib.sha256(template.encode()).hexdigest()[:12]

From 1b3602d7db13349b79c8d95c2c628b23ea5346d6 Mon Sep 17 00:00:00 2001
From: "joseph.marinier" <joseph.marinier@servicenow.com>
Date: Wed, 13 May 2026 15:28:44 -0400
Subject: [PATCH 5/8] Move src/eva/models/versioning.py to src/eva/metrics/

---
 src/eva/metrics/base.py                   | 2 +-
 src/eva/metrics/runner.py                 | 2 +-
 src/eva/metrics/signatures.py             | 2 +-
 src/eva/{models => metrics}/versioning.py | 0
 src/eva/models/results.py                 | 2 +-
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename src/eva/{models => metrics}/versioning.py (100%)

diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py
index f05563e5..7935f589 100644
--- a/src/eva/metrics/base.py
+++ b/src/eva/metrics/base.py
@@ -22,9 +22,9 @@
     resolve_turn_id,
     validate_rating,
 )
+from eva.metrics.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
 from eva.models.config import PipelineType
 from eva.models.results import MetricScore
-from eva.models.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
 from eva.utils.llm_client import LLMClient
 from eva.utils.logging import get_logger
 from eva.utils.prompt_manager import get_prompt_manager
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index 32a5b26b..2744db52 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -16,10 +16,10 @@
 from eva.metrics.processor import MetricsContextProcessor
 from eva.metrics.registry import MetricRegistry, get_global_registry
 from eva.metrics.utils import direction_for_sub_metric
+from eva.metrics.versioning import _CURRENT_METRIC_VERSION
 from eva.models.config import PipelineType, get_pipeline_type
 from eva.models.record import EvaluationRecord
 from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
-from eva.models.versioning import _CURRENT_METRIC_VERSION
 from eva.utils.hash_utils import get_dict_hash
 from eva.utils.logging import get_logger
 from eva.utils.pass_at_k import (
diff --git a/src/eva/metrics/signatures.py b/src/eva/metrics/signatures.py
index 1b5e9e98..f7ca1f36 100644
--- a/src/eva/metrics/signatures.py
+++ b/src/eva/metrics/signatures.py
@@ -20,7 +20,7 @@
 import eva.metrics.experience  # noqa: F401
 import eva.metrics.validation  # noqa: F401
 from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric
-from eva.models.versioning import hash_prompt_template
+from eva.metrics.versioning import hash_prompt_template
 from eva.utils.prompt_manager import get_prompt_manager
 
 
diff --git a/src/eva/models/versioning.py b/src/eva/metrics/versioning.py
similarity index 100%
rename from src/eva/models/versioning.py
rename to src/eva/metrics/versioning.py
diff --git a/src/eva/models/results.py b/src/eva/models/results.py
index cf9e504e..0558c9dc 100644
--- a/src/eva/models/results.py
+++ b/src/eva/models/results.py
@@ -6,7 +6,7 @@
 
 from pydantic import BaseModel, Field, model_validator
 
-from eva.models.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
+from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
 
 
 class ErrorDetails(BaseModel):

From 0cb0f5e5cb70118a0590ddb773a2ba98f18413b0 Mon Sep 17 00:00:00 2001
From: "joseph.marinier" <joseph.marinier@servicenow.com>
Date: Wed, 13 May 2026 15:28:52 -0400
Subject: [PATCH 6/8] Regenerate metric signatures in pre-commit

---
 .pre-commit-config.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 96b94736..b67a4670 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,6 +15,19 @@ repos:
       - id: end-of-file-fixer
   - repo: local
     hooks:
+      - id: regen-metric-signatures
+        name: Regenerate metric signatures
+        entry: python3 scripts/regen_metric_signatures.py
+        language: system
+        pass_filenames: false
+        files: |-
+          (?x:
+            ^configs/prompts/judge\.yaml$
+            |
+            ^scripts/regen_metric_signatures\.py$
+            |
+            ^src/eva/metrics/
+          )
       - id: check-version-bump
         name: Check simulation/metrics version bump
         entry: python3 scripts/check_version_bump.py

From c692de98e27f68025670b56c7a29ab731aa92c6d Mon Sep 17 00:00:00 2001
From: "joseph.marinier" <joseph.marinier@servicenow.com>
Date: Wed, 13 May 2026 15:39:05 -0400
Subject: [PATCH 7/8] Avoid circular import

---
 src/eva/models/results.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/eva/models/results.py b/src/eva/models/results.py
index 0558c9dc..0caa7ab6 100644
--- a/src/eva/models/results.py
+++ b/src/eva/models/results.py
@@ -6,8 +6,6 @@
 
 from pydantic import BaseModel, Field, model_validator
 
-from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
-
 
 class ErrorDetails(BaseModel):
     """Detailed error information."""
@@ -114,10 +112,15 @@ class MetricScore(BaseModel):
     def _auto_stamp_version_and_hash(self) -> "MetricScore":
         # Only fill if unset, so deserialization from disk preserves historical values
         # and explicit kwargs (e.g., tests) always win.
-        if self.version is None:
-            self.version = _CURRENT_METRIC_VERSION.get()
-        if self.prompt_hash is None:
-            self.prompt_hash = _CURRENT_PROMPT_HASH.get()
+        if self.version is None or self.prompt_hash is None:
+            # Lazy import to avoid circular dependency:
+            # eva.models.results -> eva.metrics -> ... -> eva.metrics.utils -> eva.models.results
+            from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
+
+            if self.version is None:
+                self.version = _CURRENT_METRIC_VERSION.get()
+            if self.prompt_hash is None:
+                self.prompt_hash = _CURRENT_PROMPT_HASH.get()
         return self
 
 

From 1d8a8d77f607dae0b1f8c546558e73b67940f44c Mon Sep 17 00:00:00 2001
From: "joseph.marinier" <joseph.marinier@servicenow.com>
Date: Wed, 13 May 2026 15:52:13 -0400
Subject: [PATCH 8/8] Specify `uv run` in `pre-commit`

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b67a4670..2bf5ef82 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
     hooks:
       - id: regen-metric-signatures
         name: Regenerate metric signatures
-        entry: python3 scripts/regen_metric_signatures.py
+        entry: uv run python scripts/regen_metric_signatures.py
         language: system
         pass_filenames: false
         files: |-
@@ -30,7 +30,7 @@ repos:
           )
       - id: check-version-bump
         name: Check simulation/metrics version bump
-        entry: python3 scripts/check_version_bump.py
+        entry: uv run python scripts/check_version_bump.py
         language: system
         pass_filenames: false
         always_run: true