15 changes: 14 additions & 1 deletion .pre-commit-config.yaml
@@ -15,9 +15,22 @@ repos:
      - id: end-of-file-fixer
  - repo: local
    hooks:
      - id: regen-metric-signatures
        name: Regenerate metric signatures
        entry: uv run python scripts/regen_metric_signatures.py
        language: system
        pass_filenames: false
        files: |-
          (?x:
            ^configs/prompts/judge\.yaml$
            |
            ^scripts/regen_metric_signatures\.py$
            |
            ^src/eva/metrics/
          )
      - id: check-version-bump
        name: Check simulation/metrics version bump
-       entry: python3 scripts/check_version_bump.py
+       entry: uv run python scripts/check_version_bump.py
        language: system
        pass_filenames: false
        always_run: true
30 changes: 30 additions & 0 deletions scripts/regen_metric_signatures.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""Regenerate tests/fixtures/metric_signatures.json.

Run this after intentionally changing a metric's logic and bumping its
`version` class attribute (or after editing its judge prompt template).
The drift test (tests/unit/metrics/test_metric_signatures.py) compares
the current state against this fixture and fails on any unintended drift.

Usage:
    python scripts/regen_metric_signatures.py
"""

import json
from pathlib import Path

from eva.metrics.signatures import compute_all_metric_signatures

REPO_ROOT = Path(__file__).resolve().parent.parent
FIXTURE_PATH = REPO_ROOT / "tests" / "fixtures" / "metric_signatures.json"


def main() -> None:
    signatures = compute_all_metric_signatures()
    FIXTURE_PATH.parent.mkdir(parents=True, exist_ok=True)
    FIXTURE_PATH.write_text(json.dumps(signatures, indent=2, sort_keys=True) + "\n")
    print(f"Wrote {len(signatures)} metric signatures to {FIXTURE_PATH.relative_to(REPO_ROOT)}")


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/agent_speech_fidelity.py
@@ -14,6 +14,7 @@ class AgentSpeechFidelityMetric(SpeechFidelityBaseMetric):
"""

name = "agent_speech_fidelity"
version = "v0.1"
description = "Audio-based evaluation of agent speech fidelity to the intended text"
category = "accuracy"
role = "assistant"
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
@@ -25,6 +25,7 @@ class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric):
"""

name = "agent_speech_fidelity"
version = "v0.1"
description = "Audio-based evaluation of agent entity fidelity for S2S models"
category = "accuracy"
role = "assistant"
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/faithfulness.py
@@ -54,6 +54,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric):
"""

name = "faithfulness"
version = "v0.1"
description = (
"LLM judge evaluation of whether the assistant remains faithful to information, policies, and instructions"
)
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/task_completion.py
@@ -35,6 +35,7 @@ class TaskCompletion(BaseMetric):
"""

name = "task_completion"
version = "v0.1"
description = "Binary task completion via scenario DB state hash comparison"
category = "accuracy"
metric_type = MetricType.CODE
11 changes: 10 additions & 1 deletion src/eva/metrics/base.py
@@ -22,6 +22,7 @@
    resolve_turn_id,
    validate_rating,
)
from eva.metrics.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
from eva.models.config import PipelineType
from eva.models.results import MetricScore
from eva.utils.llm_client import LLMClient
@@ -163,6 +164,9 @@ class BaseMetric(ABC):
    pass_at_k_threshold: float = 0.5  # Normalized score threshold for pass@k pass/fail
    exclude_from_pass_at_k: bool = False  # Set True for metrics not suitable for pass@k
    supported_pipeline_types: frozenset[PipelineType] = frozenset(PipelineType)  # Pipeline types this metric supports
    # Bump on intentional logic changes; MetricsRunner stamps this onto every MetricScore
    # produced by compute(). Required on all concrete subclasses — drift test enforces.
    version: str | None = None
    # Direction of the displayed value (normalized_score if present, else score).
    # Override to False for lower-is-better parent metrics (e.g. latency). Sub-metric
    # direction is derived from the key suffix (see eva.metrics.utils.direction_for_sub_metric).
@@ -179,8 +183,13 @@ def __init__(self, config: dict[str, Any] | None = None):
        self.prompt_manager = get_prompt_manager()

    def get_judge_prompt(self, prompt_key: str = "user_prompt", **variables) -> str:
-       """Get judge prompt using PromptManager."""
+       """Get judge prompt using PromptManager.
+
+       Stamps the unrendered template's sha256[:12] into the prompt-hash contextvar so
+       any MetricScore built afterwards in the same compute() picks it up automatically.
+       """
        prompt_path = f"judge.{self.name}.{prompt_key}"
        _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path)))
        return self.prompt_manager.get_prompt(prompt_path, **variables)

    @abstractmethod
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/authentication_success.py
@@ -45,6 +45,7 @@ class AuthenticationSuccessMetric(CodeMetric):
"""

name = "authentication_success"
version = "v0.1"
description = "Checks if session state in final DB is a superset of expected session"
category = "diagnostic"
exclude_from_pass_at_k = True
@@ -11,6 +11,7 @@ class ConversationCorrectlyFinishedMetric(CodeMetric):
"""0.0 when the agent timed out on the user's final turn; 1.0 otherwise."""

name = "conversation_correctly_finished"
version = "v0.1"
description = "Diagnostic metric: 0.0 when agent failed to respond to the user's final turn"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/response_speed.py
@@ -60,6 +60,7 @@ class ResponseSpeedMetric(CodeMetric):
    description = "Diagnostic metric: latency between user utterance end and assistant response start"
    exclude_from_pass_at_k = True
    higher_is_better = False  # Score is latency in seconds — lower is better.
    version = "v0.1"

    async def compute(self, context: MetricContext) -> MetricScore:
        try:
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/speakability.py
@@ -30,6 +30,7 @@ class SpeakabilityJudgeMetric(PerTurnConversationJudgeMetric):
"""

name = "speakability"
version = "v0.1"
description = "Debug metric: LLM judge evaluation of text voice-friendliness per turn"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/stt_wer.py
@@ -72,6 +72,7 @@ class STTWERMetric(CodeMetric):
"""

name = "stt_wer"
version = "v0.1"
description = "Debug metric: Speech-to-Text transcription accuracy using Word Error Rate"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/tool_call_validity.py
@@ -39,6 +39,7 @@ class ToolCallValidity(CodeMetric):
"""

name = "tool_call_validity"
version = "v0.1"
description = "Debug metric: fraction of tool calls with correctly formatted parameters"
category = "diagnostic"
exclude_from_pass_at_k = True
@@ -48,6 +48,7 @@ class TranscriptionAccuracyKeyEntitiesMetric(TextJudgeMetric):
"""

name = "transcription_accuracy_key_entities"
version = "v0.1"
description = "Debug metric: LLM judge evaluation of STT key entity transcription accuracy for entire conversation"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/experience/conciseness.py
@@ -28,6 +28,7 @@ class ConcisenessJudgeMetric(PerTurnConversationJudgeMetric):
"""

name = "conciseness"
version = "v0.1"
description = "LLM judge evaluation of assistant response conciseness"
category = "experience"
rating_scale = (1, 3)
1 change: 1 addition & 0 deletions src/eva/metrics/experience/conversation_progression.py
@@ -32,6 +32,7 @@ class ConversationProgressionJudgeMetric(ConversationTextJudgeMetric):
"""

name = "conversation_progression"
version = "v0.1"
description = "LLM judge evaluation of whether the assistant moved the conversation forward productively"
category = "experience"
rating_scale = (1, 3)
1 change: 1 addition & 0 deletions src/eva/metrics/experience/turn_taking.py
@@ -58,6 +58,7 @@ class TurnTakingMetric(CodeMetric):
    description = "Turn-taking evaluation based on per-turn latency and interruption behavior"
    category = "experience"
    pass_at_k_threshold = 0.8
    version = "v0.1"

    # --- Latency curve (piecewise linear). 0 outside [LATENCY_HARD_EARLY_MS, LATENCY_HARD_LATE_MS]. ---
    # Ramp up 0 → 1 from LATENCY_HARD_EARLY_MS to LATENCY_SWEET_SPOT_LOW_MS.
4 changes: 4 additions & 0 deletions src/eva/metrics/runner.py
@@ -16,6 +16,7 @@
from eva.metrics.processor import MetricsContextProcessor
from eva.metrics.registry import MetricRegistry, get_global_registry
from eva.metrics.utils import direction_for_sub_metric
from eva.metrics.versioning import _CURRENT_METRIC_VERSION
from eva.models.config import PipelineType, get_pipeline_type
from eva.models.record import EvaluationRecord
from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
@@ -448,6 +449,9 @@ async def _run_record(self, record_id: str, record_dir: Path) -> RecordMetrics:
        # Create tasks for all metrics
        async def compute_metric(metric: BaseMetric) -> tuple[str, MetricScore]:
            """Compute a single metric and handle errors."""
            # Each gather() task gets its own contextvar snapshot, so this set is
            # isolated from sibling/parent tasks — no reset needed.
            _CURRENT_METRIC_VERSION.set(metric.version)
            try:
                logger.info(f"[{record_id}] Starting metric: {metric.name}")
                score = await metric.compute(context)
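Why that `set()` needs no matching reset: `asyncio.gather()` wraps each coroutine in a Task, and every Task runs in a copy of the caller's context, so contextvar writes stay local to the task that made them. A self-contained sketch of that behavior (toy names, not eva code):

import asyncio
from contextvars import ContextVar

_VERSION: ContextVar[str | None] = ContextVar("version", default=None)


async def compute(name: str, version: str) -> tuple[str, str | None]:
    _VERSION.set(version)  # visible only inside this task's copy of the context
    await asyncio.sleep(0)  # yield so the two tasks interleave
    return name, _VERSION.get()


async def main() -> None:
    results = await asyncio.gather(compute("a", "v1"), compute("b", "v2"))
    print(results)  # [('a', 'v1'), ('b', 'v2')]: no cross-talk between tasks
    print(_VERSION.get())  # None: the caller's context is untouched


asyncio.run(main())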
78 changes: 78 additions & 0 deletions src/eva/metrics/signatures.py
@@ -0,0 +1,78 @@
"""Compute drift signatures for metric classes.

A metric's "signature" captures everything we want to detect changes to:
- `version`: the manually-bumped string on the class
- `source_hash`: sha256[:12] of `inspect.getsource(cls)` (class body)
- `prompt_hash`: sha256[:12] of the unrendered judge prompt template, or
  None for non-judge metrics

The drift test compares the current signatures against a checked-in fixture
and fails if anything changed without an explicit version bump + fixture regen.
"""

import hashlib
import inspect

# Importing the metric subpackages forces all concrete metric classes to be
# registered as BaseMetric subclasses, so walking __subclasses__ finds them.
import eva.metrics.accuracy # noqa: F401
import eva.metrics.diagnostic # noqa: F401
import eva.metrics.experience # noqa: F401
import eva.metrics.validation # noqa: F401
from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric
from eva.metrics.versioning import hash_prompt_template
from eva.utils.prompt_manager import get_prompt_manager


def _all_concrete_versioned_metric_classes() -> dict[str, type[BaseMetric]]:
    """Walk BaseMetric subclasses; return concrete classes that set a version.

    Keyed on class qualname so co-named classes (e.g., the cascade vs S2S
    variants of `agent_speech_fidelity`) get distinct entries.
    """
    result: dict[str, type[BaseMetric]] = {}

    def walk(cls: type) -> None:
        for sub in cls.__subclasses__():
            walk(sub)
            if inspect.isabstract(sub):
                continue
            # `version` is None on BaseMetric; only concrete classes that
            # deliberately set it are participating.
            if getattr(sub, "version", None) is None:
                continue
            result[sub.__qualname__] = sub

    walk(BaseMetric)
    return result


def _source_hash(cls: type) -> str:
    """sha256[:12] of the class body source."""
    return hashlib.sha256(inspect.getsource(cls).encode("utf-8")).hexdigest()[:12]


def _prompt_hash_for_metric(cls: type[BaseMetric]) -> str | None:
    """Return the prompt template hash for judge metrics, or None.

    All judge metrics in this codebase use `judge.{name}.user_prompt`.
    A judge metric without a corresponding template raises KeyError —
    that's a configuration bug we want surfaced.
    """
    if not issubclass(cls, TextJudgeMetric | AudioJudgeMetric):
        return None
    template = get_prompt_manager().get_template(f"judge.{cls.name}.user_prompt")
    return hash_prompt_template(template)


def compute_all_metric_signatures() -> dict[str, dict[str, str | None]]:
    """Return {class_qualname: {name, version, source_hash, prompt_hash}} for every concrete metric."""
    out: dict[str, dict[str, str | None]] = {}
    for qualname, cls in _all_concrete_versioned_metric_classes().items():
        out[qualname] = {
            "name": cls.name,
            "version": cls.version,
            "source_hash": _source_hash(cls),
            "prompt_hash": _prompt_hash_for_metric(cls),
        }
    return out
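The drift test itself is not part of this diff. Under the assumptions in the docstrings above (fixture at tests/fixtures/metric_signatures.json, test at tests/unit/metrics/test_metric_signatures.py), it plausibly reduces to a single dict comparison; a sketch:

import json
from pathlib import Path

from eva.metrics.signatures import compute_all_metric_signatures

# Hypothetical path arithmetic: tests/unit/metrics/ -> tests/fixtures/
FIXTURE_PATH = Path(__file__).resolve().parents[2] / "fixtures" / "metric_signatures.json"


def test_metric_signatures_match_fixture() -> None:
    expected = json.loads(FIXTURE_PATH.read_text())
    actual = compute_all_metric_signatures()
    # Any source, prompt, or version drift surfaces as a dict mismatch; the fix is an
    # intentional version bump followed by `python scripts/regen_metric_signatures.py`.
    assert actual == expected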
1 change: 1 addition & 0 deletions src/eva/metrics/validation/conversation_valid_end.py
@@ -14,6 +14,7 @@ class ConversationValidEndMetric(CodeMetric):
"""Binary score: 1.0 when the conversation ended on goodbye OR agent-timeout-on-user-turn; 0.0 otherwise."""

name = "conversation_valid_end"
version = "v0.1"
description = "Validation metric: conversation reached a definitive end state"
category = "validation"

1 change: 1 addition & 0 deletions src/eva/metrics/validation/user_behavioral_fidelity.py
@@ -65,6 +65,7 @@ class UserBehavioralFidelityMetric(ConversationTextJudgeMetric):
"""

name = "user_behavioral_fidelity"
version = "v0.1"
description = "Validation metric for simulated user corruption detection"
category = "validation"
rating_scale = (0, 1)
1 change: 1 addition & 0 deletions src/eva/metrics/validation/user_speech_fidelity.py
@@ -14,6 +14,7 @@ class UserSpeechFidelityMetric(SpeechFidelityBaseMetric):
"""

name = "user_speech_fidelity"
version = "v0.1"
description = "Audio-based validation of user speech fidelity to the intended text"
category = "validation"
role = "user"
24 changes: 24 additions & 0 deletions src/eva/metrics/versioning.py
@@ -0,0 +1,24 @@
"""Per-record version + prompt-hash stamping for MetricScore.

MetricsRunner sets these contextvars around every metric.compute() call.
The MetricScore Pydantic model has a model_validator that reads them and
auto-fills the version/prompt_hash fields when unset, so all scores and
sub-scores built inside that compute() inherit the right values without
each call site having to thread them through explicitly.

Both contextvars default to None, which means "not currently inside a
metric compute() call" — that's the state during JSON deserialization
(loading metrics.json from disk), so existing on-disk values are
preserved instead of being overwritten with None.
"""

import hashlib
from contextvars import ContextVar

_CURRENT_METRIC_VERSION: ContextVar[str | None] = ContextVar("current_metric_version", default=None)
_CURRENT_PROMPT_HASH: ContextVar[str | None] = ContextVar("current_prompt_hash", default=None)


def hash_prompt_template(template: str) -> str:
    """Return sha256[:12] of an unrendered prompt template string."""
    return hashlib.sha256(template.encode()).hexdigest()[:12]
27 changes: 26 additions & 1 deletion src/eva/models/results.py
@@ -4,7 +4,7 @@
from datetime import datetime
from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator


class ErrorDetails(BaseModel):
@@ -94,10 +94,35 @@ class MetricScore(BaseModel):
        False,
        description="True when the metric had no applicable data to score (distinct from errored)",
    )
    version: str | None = Field(
        None,
        description="Metric implementation version (set by the metric class) for tracking which "
        "computation logic produced this score across partial reruns",
    )
    prompt_hash: str | None = Field(
        None,
        description="sha256[:12] of the unrendered judge prompt template; None for non-judge metrics. "
        "Lets us detect prompt edits without relying on the metric author to bump `version`.",
    )
    sub_metrics: dict[str, "MetricScore"] | None = Field(
        None, description="Optional sub-metric breakdowns, aggregated generically by the runner"
    )

    @model_validator(mode="after")
    def _auto_stamp_version_and_hash(self) -> "MetricScore":
        # Only fill if unset, so deserialization from disk preserves historical values
        # and explicit kwargs (e.g., tests) always win.
        if self.version is None or self.prompt_hash is None:
            # Lazy import to avoid circular dependency:
            # eva.models.results -> eva.metrics -> ... -> eva.metrics.utils -> eva.models.results
            from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH

            if self.version is None:
                self.version = _CURRENT_METRIC_VERSION.get()
            if self.prompt_hash is None:
                self.prompt_hash = _CURRENT_PROMPT_HASH.get()
        return self


class PassAtKResult(BaseModel):
"""pass@k and pass^k result for a single metric across multiple trials."""