diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 96b94736..2bf5ef82 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,9 +15,22 @@ repos:
       - id: end-of-file-fixer
   - repo: local
     hooks:
+      - id: regen-metric-signatures
+        name: Regenerate metric signatures
+        entry: uv run python scripts/regen_metric_signatures.py
+        language: system
+        pass_filenames: false
+        files: |-
+          (?x:
+            ^configs/prompts/judge\.yaml$
+            |
+            ^scripts/regen_metric_signatures\.py$
+            |
+            ^src/eva/metrics/
+          )
       - id: check-version-bump
         name: Check simulation/metrics version bump
-        entry: python3 scripts/check_version_bump.py
+        entry: uv run python scripts/check_version_bump.py
         language: system
         pass_filenames: false
         always_run: true
diff --git a/scripts/regen_metric_signatures.py b/scripts/regen_metric_signatures.py
new file mode 100644
index 00000000..9c27e69e
--- /dev/null
+++ b/scripts/regen_metric_signatures.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Regenerate tests/fixtures/metric_signatures.json.
+
+Run this after intentionally changing a metric's logic and bumping its
+`version` class attribute (or after editing its judge prompt template).
+The drift test (tests/unit/metrics/test_metric_signatures.py) compares
+the current state against this fixture and fails on any unintended drift.
+
+Usage:
+    python scripts/regen_metric_signatures.py
+"""
+
+import json
+from pathlib import Path
+
+from eva.metrics.signatures import compute_all_metric_signatures
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+FIXTURE_PATH = REPO_ROOT / "tests" / "fixtures" / "metric_signatures.json"
+
+
+def main() -> None:
+    signatures = compute_all_metric_signatures()
+    FIXTURE_PATH.parent.mkdir(parents=True, exist_ok=True)
+    FIXTURE_PATH.write_text(json.dumps(signatures, indent=2, sort_keys=True) + "\n")
+    print(f"Wrote {len(signatures)} metric signatures to {FIXTURE_PATH.relative_to(REPO_ROOT)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity.py b/src/eva/metrics/accuracy/agent_speech_fidelity.py
index a659f7c4..d84179f7 100644
--- a/src/eva/metrics/accuracy/agent_speech_fidelity.py
+++ b/src/eva/metrics/accuracy/agent_speech_fidelity.py
@@ -14,6 +14,7 @@ class AgentSpeechFidelityMetric(SpeechFidelityBaseMetric):
     """
 
     name = "agent_speech_fidelity"
+    version = "v0.1"
     description = "Audio-based evaluation of agent speech fidelity to the intended text"
     category = "accuracy"
     role = "assistant"
diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
index c6d43bb6..2a9706cf 100644
--- a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
+++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
@@ -25,6 +25,7 @@ class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric):
     """
 
     name = "agent_speech_fidelity"
+    version = "v0.1"
    description = "Audio-based evaluation of agent entity fidelity for S2S models"
     category = "accuracy"
     role = "assistant"
diff --git a/src/eva/metrics/accuracy/faithfulness.py b/src/eva/metrics/accuracy/faithfulness.py
index 3a85d29e..e6743792 100644
--- a/src/eva/metrics/accuracy/faithfulness.py
+++ b/src/eva/metrics/accuracy/faithfulness.py
@@ -54,6 +54,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric):
     """
 
     name = "faithfulness"
+    version = "v0.1"
     description = (
         "LLM judge evaluation of whether the assistant remains faithful to information, policies, and instructions"
     )
diff --git a/src/eva/metrics/accuracy/task_completion.py b/src/eva/metrics/accuracy/task_completion.py
index f1cbe8d0..2ca780b8 100644
--- a/src/eva/metrics/accuracy/task_completion.py
+++ b/src/eva/metrics/accuracy/task_completion.py
@@ -35,6 +35,7 @@ class TaskCompletion(BaseMetric):
     """
 
     name = "task_completion"
+    version = "v0.1"
     description = "Binary task completion via scenario DB state hash comparison"
     category = "accuracy"
     metric_type = MetricType.CODE
diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py
index 45a4a501..7935f589 100644
--- a/src/eva/metrics/base.py
+++ b/src/eva/metrics/base.py
@@ -22,6 +22,7 @@
     resolve_turn_id,
     validate_rating,
 )
+from eva.metrics.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
 from eva.models.config import PipelineType
 from eva.models.results import MetricScore
 from eva.utils.llm_client import LLMClient
@@ -163,6 +164,9 @@ class BaseMetric(ABC):
     pass_at_k_threshold: float = 0.5  # Normalized score threshold for pass@k pass/fail
     exclude_from_pass_at_k: bool = False  # Set True for metrics not suitable for pass@k
     supported_pipeline_types: frozenset[PipelineType] = frozenset(PipelineType)  # Pipeline types this metric supports
+    # Bump on intentional logic changes; MetricsRunner stamps this onto every MetricScore
+    # produced by compute(). Required on all concrete subclasses; the drift test enforces this.
+    version: str | None = None
     # Direction of the displayed value (normalized_score if present, else score).
     # Override to False for lower-is-better parent metrics (e.g. latency). Sub-metric
     # direction is derived from the key suffix (see eva.metrics.utils.direction_for_sub_metric).
@@ -179,8 +183,13 @@ def __init__(self, config: dict[str, Any] | None = None):
         self.prompt_manager = get_prompt_manager()
 
     def get_judge_prompt(self, prompt_key: str = "user_prompt", **variables) -> str:
-        """Get judge prompt using PromptManager."""
+        """Get judge prompt using PromptManager.
+
+        Stamps the unrendered template's sha256[:12] into the prompt-hash contextvar so
+        any MetricScore built afterwards in the same compute() picks it up automatically.
+        """
         prompt_path = f"judge.{self.name}.{prompt_key}"
+        _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path)))
         return self.prompt_manager.get_prompt(prompt_path, **variables)
 
     @abstractmethod
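Note: the contextvar handoff in `get_judge_prompt` above is easiest to see in isolation. Below is a minimal, runnable sketch of the same pattern; `_PROMPT_HASH`, `hash_template`, and the free-standing `get_judge_prompt` are hypothetical stand-ins for this sketch, not the repo's actual names.

```python
# Standalone sketch of the stamping pattern: hash the *unrendered* template
# into a contextvar before rendering, so later consumers can read it.
import hashlib
from contextvars import ContextVar

_PROMPT_HASH: ContextVar[str | None] = ContextVar("prompt_hash", default=None)


def hash_template(template: str) -> str:
    # Hashing the unrendered template means per-record variables never change the hash.
    return hashlib.sha256(template.encode()).hexdigest()[:12]


def get_judge_prompt(template: str, **variables) -> str:
    # Stamp first, then render; anything built afterwards in this context sees the hash.
    _PROMPT_HASH.set(hash_template(template))
    return template.format(**variables)


rendered = get_judge_prompt("Rate this reply: {reply}", reply="hello")
assert rendered == "Rate this reply: hello"
assert _PROMPT_HASH.get() == hash_template("Rate this reply: {reply}")
```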
+ """ prompt_path = f"judge.{self.name}.{prompt_key}" + _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path))) return self.prompt_manager.get_prompt(prompt_path, **variables) @abstractmethod diff --git a/src/eva/metrics/diagnostic/authentication_success.py b/src/eva/metrics/diagnostic/authentication_success.py index 80f93905..0662231b 100644 --- a/src/eva/metrics/diagnostic/authentication_success.py +++ b/src/eva/metrics/diagnostic/authentication_success.py @@ -45,6 +45,7 @@ class AuthenticationSuccessMetric(CodeMetric): """ name = "authentication_success" + version = "v0.1" description = "Checks if session state in final DB is a superset of expected session" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/conversation_correctly_finished.py b/src/eva/metrics/diagnostic/conversation_correctly_finished.py index 6ce5cc49..c38adb7d 100644 --- a/src/eva/metrics/diagnostic/conversation_correctly_finished.py +++ b/src/eva/metrics/diagnostic/conversation_correctly_finished.py @@ -11,6 +11,7 @@ class ConversationCorrectlyFinishedMetric(CodeMetric): """0.0 when the agent timed out on the user's final turn; 1.0 otherwise.""" name = "conversation_correctly_finished" + version = "v0.1" description = "Diagnostic metric: 0.0 when agent failed to respond to the user's final turn" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index a7724b07..56571e2c 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -60,6 +60,7 @@ class ResponseSpeedMetric(CodeMetric): description = "Diagnostic metric: latency between user utterance end and assistant response start" exclude_from_pass_at_k = True higher_is_better = False # Score is latency in seconds — lower is better. 
+ version = "v0.1" async def compute(self, context: MetricContext) -> MetricScore: try: diff --git a/src/eva/metrics/diagnostic/speakability.py b/src/eva/metrics/diagnostic/speakability.py index bc9d8a45..f3e9c3b4 100644 --- a/src/eva/metrics/diagnostic/speakability.py +++ b/src/eva/metrics/diagnostic/speakability.py @@ -30,6 +30,7 @@ class SpeakabilityJudgeMetric(PerTurnConversationJudgeMetric): """ name = "speakability" + version = "v0.1" description = "Debug metric: LLM judge evaluation of text voice-friendliness per turn" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/stt_wer.py b/src/eva/metrics/diagnostic/stt_wer.py index 52ad5c2b..5c52e50c 100644 --- a/src/eva/metrics/diagnostic/stt_wer.py +++ b/src/eva/metrics/diagnostic/stt_wer.py @@ -72,6 +72,7 @@ class STTWERMetric(CodeMetric): """ name = "stt_wer" + version = "v0.1" description = "Debug metric: Speech-to-Text transcription accuracy using Word Error Rate" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/tool_call_validity.py b/src/eva/metrics/diagnostic/tool_call_validity.py index 12b4f7d8..01cfde95 100644 --- a/src/eva/metrics/diagnostic/tool_call_validity.py +++ b/src/eva/metrics/diagnostic/tool_call_validity.py @@ -39,6 +39,7 @@ class ToolCallValidity(CodeMetric): """ name = "tool_call_validity" + version = "v0.1" description = "Debug metric: fraction of tool calls with correctly formatted parameters" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py index ecdb79bf..4990b579 100644 --- a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py +++ b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py @@ -48,6 +48,7 @@ class TranscriptionAccuracyKeyEntitiesMetric(TextJudgeMetric): """ name = "transcription_accuracy_key_entities" + version = "v0.1" description = "Debug metric: LLM judge evaluation of STT key entity transcription accuracy for entire conversation" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/experience/conciseness.py b/src/eva/metrics/experience/conciseness.py index 3f77890e..2b4bc196 100644 --- a/src/eva/metrics/experience/conciseness.py +++ b/src/eva/metrics/experience/conciseness.py @@ -28,6 +28,7 @@ class ConcisenessJudgeMetric(PerTurnConversationJudgeMetric): """ name = "conciseness" + version = "v0.1" description = "LLM judge evaluation of assistant response conciseness" category = "experience" rating_scale = (1, 3) diff --git a/src/eva/metrics/experience/conversation_progression.py b/src/eva/metrics/experience/conversation_progression.py index 595101e4..f5abfa80 100644 --- a/src/eva/metrics/experience/conversation_progression.py +++ b/src/eva/metrics/experience/conversation_progression.py @@ -32,6 +32,7 @@ class ConversationProgressionJudgeMetric(ConversationTextJudgeMetric): """ name = "conversation_progression" + version = "v0.1" description = "LLM judge evaluation of whether the assistant moved the conversation forward productively" category = "experience" rating_scale = (1, 3) diff --git a/src/eva/metrics/experience/turn_taking.py b/src/eva/metrics/experience/turn_taking.py index 08034584..7e3da258 100644 --- a/src/eva/metrics/experience/turn_taking.py +++ b/src/eva/metrics/experience/turn_taking.py @@ -58,6 +58,7 @@ class TurnTakingMetric(CodeMetric): description = "Turn-taking evaluation based on 
per-turn latency and interruption behavior" category = "experience" pass_at_k_threshold = 0.8 + version = "v0.1" # --- Latency curve (piecewise linear). 0 outside [LATENCY_HARD_EARLY_MS, LATENCY_HARD_LATE_MS]. --- # Ramp up 0 → 1 from LATENCY_HARD_EARLY_MS to LATENCY_SWEET_SPOT_LOW_MS. diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index f7d945c5..2744db52 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -16,6 +16,7 @@ from eva.metrics.processor import MetricsContextProcessor from eva.metrics.registry import MetricRegistry, get_global_registry from eva.metrics.utils import direction_for_sub_metric +from eva.metrics.versioning import _CURRENT_METRIC_VERSION from eva.models.config import PipelineType, get_pipeline_type from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics @@ -448,6 +449,9 @@ async def _run_record(self, record_id: str, record_dir: Path) -> RecordMetrics: # Create tasks for all metrics async def compute_metric(metric: BaseMetric) -> tuple[str, MetricScore]: """Compute a single metric and handle errors.""" + # Each gather() task gets its own contextvar snapshot, so this set is + # isolated from sibling/parent tasks — no reset needed. + _CURRENT_METRIC_VERSION.set(metric.version) try: logger.info(f"[{record_id}] Starting metric: {metric.name}") score = await metric.compute(context) diff --git a/src/eva/metrics/signatures.py b/src/eva/metrics/signatures.py new file mode 100644 index 00000000..f7ca1f36 --- /dev/null +++ b/src/eva/metrics/signatures.py @@ -0,0 +1,78 @@ +"""Compute drift signatures for metric classes. + +A metric's "signature" captures everything we want to detect changes to: + - `version`: the manually-bumped string on the class + - `source_hash`: sha256[:12] of `inspect.getsource(cls)` (class body) + - `prompt_hash`: sha256[:12] of the unrendered judge prompt template, or + None for non-judge metrics + +The drift test compares the current signatures against a checked-in fixture +and fails if anything changed without an explicit version bump + fixture regen. +""" + +import hashlib +import inspect + +# Importing the metric subpackages forces all concrete metric classes to be +# registered as BaseMetric subclasses, so walking __subclasses__ finds them. +import eva.metrics.accuracy # noqa: F401 +import eva.metrics.diagnostic # noqa: F401 +import eva.metrics.experience # noqa: F401 +import eva.metrics.validation # noqa: F401 +from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric +from eva.metrics.versioning import hash_prompt_template +from eva.utils.prompt_manager import get_prompt_manager + + +def _all_concrete_versioned_metric_classes() -> dict[str, type[BaseMetric]]: + """Walk BaseMetric subclasses; return concrete classes that set a version. + + Keyed on class qualname so co-named classes (e.g., the cascade vs S2S + variants of `agent_speech_fidelity`) get distinct entries. + """ + result: dict[str, type[BaseMetric]] = {} + + def walk(cls: type) -> None: + for sub in cls.__subclasses__(): + walk(sub) + if inspect.isabstract(sub): + continue + # `version` is None on BaseMetric; only concrete classes that + # deliberately set it are participating. 
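Note: a toy version of what `signatures.py` does may help: recursively walk `__subclasses__`, keep only classes that opt in by setting `version`, and hash each class body with `inspect.getsource`. The `Base`/`Abstract`/`Concrete` classes below are invented for the sketch; the real walk starts at `BaseMetric`.

```python
# Toy demonstration of the subclass walk and source hashing (run as a script;
# inspect.getsource needs classes defined in a real file).
import hashlib
import inspect


class Base:
    version: str | None = None


class Abstract(Base):  # sets no version, so it is skipped like an abstract base
    pass


class Concrete(Abstract):
    version = "v0.1"


def walk(cls: type):
    # Depth-first over the whole subclass tree, yielding every class once.
    for sub in cls.__subclasses__():
        yield from walk(sub)
        yield sub


versioned = {c.__qualname__: c for c in walk(Base) if getattr(c, "version", None)}
for qualname, cls in versioned.items():
    source_hash = hashlib.sha256(inspect.getsource(cls).encode()).hexdigest()[:12]
    print(qualname, cls.version, source_hash)  # prints only Concrete
```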
+ if getattr(sub, "version", None) is None: + continue + result[sub.__qualname__] = sub + + walk(BaseMetric) + return result + + +def _source_hash(cls: type) -> str: + """sha256[:12] of the class body source.""" + return hashlib.sha256(inspect.getsource(cls).encode("utf-8")).hexdigest()[:12] + + +def _prompt_hash_for_metric(cls: type[BaseMetric]) -> str | None: + """Return the prompt template hash for judge metrics, or None. + + All judge metrics in this codebase use `judge.{name}.user_prompt`. + A judge metric without a corresponding template raises KeyError — + that's a configuration bug we want surfaced. + """ + if not issubclass(cls, TextJudgeMetric | AudioJudgeMetric): + return None + template = get_prompt_manager().get_template(f"judge.{cls.name}.user_prompt") + return hash_prompt_template(template) + + +def compute_all_metric_signatures() -> dict[str, dict[str, str | None]]: + """Return {class_qualname: {version, source_hash, prompt_hash}} for every concrete metric.""" + out: dict[str, dict[str, str | None]] = {} + for qualname, cls in _all_concrete_versioned_metric_classes().items(): + out[qualname] = { + "name": cls.name, + "version": cls.version, + "source_hash": _source_hash(cls), + "prompt_hash": _prompt_hash_for_metric(cls), + } + return out diff --git a/src/eva/metrics/validation/conversation_valid_end.py b/src/eva/metrics/validation/conversation_valid_end.py index 61831f6f..f6a6ebc5 100644 --- a/src/eva/metrics/validation/conversation_valid_end.py +++ b/src/eva/metrics/validation/conversation_valid_end.py @@ -14,6 +14,7 @@ class ConversationValidEndMetric(CodeMetric): """Binary score: 1.0 when the conversation ended on goodbye OR agent-timeout-on-user-turn; 0.0 otherwise.""" name = "conversation_valid_end" + version = "v0.1" description = "Validation metric: conversation reached a definitive end state" category = "validation" diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py index 56bd4da4..0af13816 100644 --- a/src/eva/metrics/validation/user_behavioral_fidelity.py +++ b/src/eva/metrics/validation/user_behavioral_fidelity.py @@ -65,6 +65,7 @@ class UserBehavioralFidelityMetric(ConversationTextJudgeMetric): """ name = "user_behavioral_fidelity" + version = "v0.1" description = "Validation metric for simulated user corruption detection" category = "validation" rating_scale = (0, 1) diff --git a/src/eva/metrics/validation/user_speech_fidelity.py b/src/eva/metrics/validation/user_speech_fidelity.py index 3f605d32..a11fc8a6 100644 --- a/src/eva/metrics/validation/user_speech_fidelity.py +++ b/src/eva/metrics/validation/user_speech_fidelity.py @@ -14,6 +14,7 @@ class UserSpeechFidelityMetric(SpeechFidelityBaseMetric): """ name = "user_speech_fidelity" + version = "v0.1" description = "Audio-based validation of user speech fidelity to the intended text" category = "validation" role = "user" diff --git a/src/eva/metrics/versioning.py b/src/eva/metrics/versioning.py new file mode 100644 index 00000000..88908a94 --- /dev/null +++ b/src/eva/metrics/versioning.py @@ -0,0 +1,24 @@ +"""Per-record version + prompt-hash stamping for MetricScore. + +MetricsRunner sets these contextvars around every metric.compute() call. +The MetricScore Pydantic model has a model_validator that reads them and +auto-fills the version/prompt_hash fields when unset, so all scores and +sub-scores built inside that compute() inherit the right values without +each call site having to thread them through explicitly. 
diff --git a/src/eva/models/results.py b/src/eva/models/results.py
index 993184db..0caa7ab6 100644
--- a/src/eva/models/results.py
+++ b/src/eva/models/results.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
 
 class ErrorDetails(BaseModel):
@@ -94,10 +94,35 @@ class MetricScore(BaseModel):
         False,
         description="True when the metric had no applicable data to score (distinct from errored)",
     )
+    version: str | None = Field(
+        None,
+        description="Metric implementation version (set by the metric class) for tracking which "
+        "computation logic produced this score across partial reruns",
+    )
+    prompt_hash: str | None = Field(
+        None,
+        description="sha256[:12] of the unrendered judge prompt template; None for non-judge metrics. "
+        "Lets us detect prompt edits without relying on the metric author to bump `version`.",
+    )
     sub_metrics: dict[str, "MetricScore"] | None = Field(
         None, description="Optional sub-metric breakdowns, aggregated generically by the runner"
     )
 
+    @model_validator(mode="after")
+    def _auto_stamp_version_and_hash(self) -> "MetricScore":
+        # Only fill if unset, so deserialization from disk preserves historical values
+        # and explicit kwargs (e.g., tests) always win.
+        if self.version is None or self.prompt_hash is None:
+            # Lazy import to avoid circular dependency:
+            # eva.models.results -> eva.metrics -> ... -> eva.metrics.utils -> eva.models.results
+            from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
+
+            if self.version is None:
+                self.version = _CURRENT_METRIC_VERSION.get()
+            if self.prompt_hash is None:
+                self.prompt_hash = _CURRENT_PROMPT_HASH.get()
+        return self
+
 
 class PassAtKResult(BaseModel):
     """pass@k and pass^k result for a single metric across multiple trials."""
diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py
index c67db941..1060c41e 100644
--- a/src/eva/utils/prompt_manager.py
+++ b/src/eva/utils/prompt_manager.py
@@ -81,19 +81,14 @@ def _load_from_directory(self, directory: Path) -> None:
         for yaml_file in yaml_files:
             self._load_single_file(yaml_file)
 
-    def get_prompt(self, path: str, **variables) -> str:
-        """Get a prompt by its path and substitute variables.
+    def get_template(self, path: str) -> str:
+        """Return the unrendered prompt template at `path` (no variable substitution).
 
         Args:
             path: Dot-separated path to the prompt (e.g., "orchestrator.system_prompt")
-            **variables: Variable values to substitute in the prompt
 
-        Returns:
-            The prompt with variables substituted
-
-        Raises:
-            KeyError: If the prompt path is not found
-            ValueError: If the prompt is not a string
+        Used as-is for hashing prompt templates so we can detect prompt edits across
+        runs without per-record variable substitutions changing the hash.
         """
         # Navigate to the prompt using the dot-separated path
         parts = path.split(".")
@@ -109,6 +104,24 @@ def get_prompt(self, path: str, **variables) -> str:
         if not isinstance(value, str):
             raise ValueError(f"Prompt at {path} is not a string: {type(value)}")
 
+        return value
+
+    def get_prompt(self, path: str, **variables) -> str:
+        """Get a prompt by its path and substitute variables.
+
+        Args:
+            path: Dot-separated path to the prompt (e.g., "orchestrator.system_prompt")
+            **variables: Variable values to substitute in the prompt
+
+        Returns:
+            The prompt with variables substituted
+
+        Raises:
+            KeyError: If the prompt path is not found
+            ValueError: If the prompt is not a string
+        """
+        value = self.get_template(path)
+
         # Substitute variables using str.format()
         # Auto-inject global variables from the _shared section (prompt-level vars take precedence)
         shared = self.prompts.get("_shared", {})
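Note: a reduced sketch of the auto-stamping validator on `MetricScore`, assuming Pydantic v2 semantics where a `model_validator(mode="after")` method receives and returns the instance. `Score` and `_VERSION` are cut-down stand-ins with most fields omitted:

```python
# Minimal model showing the fill-only-when-unset behavior of the validator above.
from contextvars import ContextVar

from pydantic import BaseModel, model_validator

_VERSION: ContextVar[str | None] = ContextVar("version", default=None)


class Score(BaseModel):
    name: str
    score: float
    version: str | None = None

    @model_validator(mode="after")
    def _stamp(self) -> "Score":
        if self.version is None:  # only fill when unset; explicit values win
            self.version = _VERSION.get()
        return self


_VERSION.set("v0.2")
assert Score(name="m", score=1.0).version == "v0.2"  # auto-stamped from the contextvar
assert Score(name="m", score=1.0, version="v0.1").version == "v0.1"  # explicit kwarg wins
_VERSION.set(None)
assert Score(name="m", score=1.0).version is None  # deserialization-like state: untouched
```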
diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json
new file mode 100644
index 00000000..fc79f392
--- /dev/null
+++ b/tests/fixtures/metric_signatures.json
@@ -0,0 +1,104 @@
+{
+  "AgentSpeechFidelityMetric": {
+    "name": "agent_speech_fidelity",
+    "prompt_hash": "864be78919d2",
+    "source_hash": "77743114e9b0",
+    "version": "v0.1"
+  },
+  "AgentSpeechFidelityS2SMetric": {
+    "name": "agent_speech_fidelity",
+    "prompt_hash": "864be78919d2",
+    "source_hash": "5b3deb4968cd",
+    "version": "v0.1"
+  },
+  "AuthenticationSuccessMetric": {
+    "name": "authentication_success",
+    "prompt_hash": null,
+    "source_hash": "cdc7c59d2684",
+    "version": "v0.1"
+  },
+  "ConcisenessJudgeMetric": {
+    "name": "conciseness",
+    "prompt_hash": "5d033338d36a",
+    "source_hash": "cd0ea09a9613",
+    "version": "v0.1"
+  },
+  "ConversationCorrectlyFinishedMetric": {
+    "name": "conversation_correctly_finished",
+    "prompt_hash": null,
+    "source_hash": "4f27cacab7d2",
+    "version": "v0.1"
+  },
+  "ConversationProgressionJudgeMetric": {
+    "name": "conversation_progression",
+    "prompt_hash": "f3240185faf6",
+    "source_hash": "91b71c803d77",
+    "version": "v0.1"
+  },
+  "ConversationValidEndMetric": {
+    "name": "conversation_valid_end",
+    "prompt_hash": null,
+    "source_hash": "02d1c3fb330b",
+    "version": "v0.1"
+  },
+  "FaithfulnessJudgeMetric": {
+    "name": "faithfulness",
+    "prompt_hash": "1add7d47362c",
+    "source_hash": "8e3fd6bc9960",
+    "version": "v0.1"
+  },
+  "ResponseSpeedMetric": {
+    "name": "response_speed",
+    "prompt_hash": null,
+    "source_hash": "ebce1a06bf30",
+    "version": "v0.1"
+  },
+  "STTWERMetric": {
+    "name": "stt_wer",
+    "prompt_hash": null,
+    "source_hash": "01fcfbc1cf21",
+    "version": "v0.1"
+  },
+  "SpeakabilityJudgeMetric": {
+    "name": "speakability",
+    "prompt_hash": "cd2cc44fc96c",
+    "source_hash": "187ddf9dc0da",
+    "version": "v0.1"
+  },
+  "TaskCompletion": {
+    "name": "task_completion",
+    "prompt_hash": null,
+    "source_hash": "01aed1a552f4",
+    "version": "v0.1"
+  },
+  "ToolCallValidity": {
+    "name": "tool_call_validity",
+    "prompt_hash": null,
+    "source_hash": "1572b16592fe",
+    "version": "v0.1"
+  },
+  "TranscriptionAccuracyKeyEntitiesMetric": {
+    "name": "transcription_accuracy_key_entities",
+    "prompt_hash": "c0980ff2168d",
+    "source_hash": "a83a699d0fda",
+    "version": "v0.1"
+  },
+  "TurnTakingMetric": {
+    "name": "turn_taking",
+    "prompt_hash": null,
+    "source_hash": "aa574674ff83",
+    "version": "v0.1"
+  },
+  "UserBehavioralFidelityMetric": {
+    "name": "user_behavioral_fidelity",
+    "prompt_hash": "06477144c28e",
+    "source_hash": "af8144bd7731",
+    "version": "v0.1"
+  },
+  "UserSpeechFidelityMetric": {
+    "name": "user_speech_fidelity",
+    "prompt_hash": "c4d97e36b865",
+    "source_hash": "e38e8c162b3d",
+    "version": "v0.1"
+  }
+}
diff --git a/tests/unit/metrics/test_metric_signatures.py b/tests/unit/metrics/test_metric_signatures.py
new file mode 100644
index 00000000..24ce61c5
--- /dev/null
+++ b/tests/unit/metrics/test_metric_signatures.py
@@ -0,0 +1,83 @@
+"""Drift test: fail when a metric's source or prompt changes without a version bump.
+
+Each concrete metric class has three signature fields:
+  - version: manually bumped string on the class
+  - source_hash: sha256[:12] of inspect.getsource(cls)
+  - prompt_hash: sha256[:12] of judge.{name}.user_prompt template (None for code metrics)
+
+The fixture at tests/fixtures/metric_signatures.json is the source of truth for
+the *currently-released* state. The test compares the current signatures to it
+and reports drift. Authors update the fixture by running
+`python scripts/regen_metric_signatures.py` after bumping `version`.
+
+Failure modes the test catches:
+  - source_hash changed, version unchanged → "bump version then regen fixture"
+  - prompt_hash changed, version unchanged → "bump version then regen fixture"
+  - version bumped but fixture not regenerated → "regen fixture"
+  - new metric class with no fixture entry → "add to fixture via regen"
+  - metric removed from fixture → "delete from fixture via regen"
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from eva.metrics.signatures import compute_all_metric_signatures
+
+FIXTURE_PATH = Path(__file__).resolve().parents[3] / "tests" / "fixtures" / "metric_signatures.json"
+REGEN_HINT = "Run `python scripts/regen_metric_signatures.py` after bumping `version` on affected classes."
+
+
+@pytest.fixture(scope="module")
+def fixture_signatures() -> dict[str, dict[str, str | None]]:
+    return json.loads(FIXTURE_PATH.read_text())
+
+
+@pytest.fixture(scope="module")
+def current_signatures() -> dict[str, dict[str, str | None]]:
+    return compute_all_metric_signatures()
+
+
+def test_no_unannounced_metric_drift(
+    fixture_signatures: dict[str, dict[str, str | None]],
+    current_signatures: dict[str, dict[str, str | None]],
+) -> None:
+    """Fail if any metric's source/prompt changed without its version being bumped."""
+    failures: list[str] = []
+
+    for qualname, current in current_signatures.items():
+        recorded = fixture_signatures.get(qualname)
+        if recorded is None:
+            failures.append(f"{qualname}: new metric class not in fixture. {REGEN_HINT}")
+            continue
+
+        version_changed = current["version"] != recorded["version"]
+        source_changed = current["source_hash"] != recorded["source_hash"]
+        prompt_changed = current["prompt_hash"] != recorded["prompt_hash"]
+
+        if not (source_changed or prompt_changed or version_changed):
+            continue  # fully in sync
+
+        if version_changed:
+            # Author bumped version; they still need to regen the fixture so
+            # future drift is detected against the new baseline.
+            failures.append(f"{qualname}: version bumped ({recorded['version']} → {current['version']}). {REGEN_HINT}")
+            continue
+
+        # Code or prompt changed but version is unchanged — the case the test
+        # exists to catch.
+        what = []
+        if source_changed:
+            what.append(f"source ({recorded['source_hash']} → {current['source_hash']})")
+        if prompt_changed:
+            what.append(f"prompt ({recorded['prompt_hash']} → {current['prompt_hash']})")
+        failures.append(
+            f"{qualname}: {' and '.join(what)} changed but version still {current['version']!r}. "
+            f"Bump `version` on the class, then run regen."
+        )
+
+    for qualname in fixture_signatures.keys() - current_signatures.keys():
+        failures.append(f"{qualname}: removed from code but still in fixture. {REGEN_HINT}")
+
+    assert not failures, "Metric signature drift detected:\n  " + "\n  ".join(failures)
diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py
index d1d587ad..fec32d0d 100644
--- a/tests/unit/metrics/test_runner.py
+++ b/tests/unit/metrics/test_runner.py
@@ -615,6 +615,96 @@ async def test_rerun_partial_success(self, tmp_path):
         assert result.metrics["m_a"].error is None
         assert result.metrics["m_b"].error == "still failing"
 
+    @pytest.mark.asyncio
+    async def test_rerun_preserves_version_of_non_rerun_metrics(self, tmp_path):
+        """Non-rerun metrics keep their on-disk version; only the rerun metric gets the new version.
+
+        Models the workflow where a user bumps `m_b`'s class version (e.g., to v0.2) and
+        then reruns only the errored records for m_b. m_a wasn't touched, so its on-disk
+        version must survive untouched.
+        """
+        run_dir = _setup_run_dir(tmp_path, ["rec-0"])
+        records = [_make_record("rec-0")]
+        record_dir = run_dir / "records" / "rec-0"
+
+        # On disk: m_a succeeded at v0.1, m_b failed at v0.1 (and we want to rerun m_b)
+        _write_metrics_json(
+            record_dir,
+            "rec-0",
+            {
+                "m_a": MetricScore(name="m_a", score=0.9, normalized_score=0.9, version="v0.1"),
+                "m_b": MetricScore(name="m_b", score=0.0, normalized_score=0.0, error="fail-b", version="v0.1"),
+            },
+        )
+
+        runner = _make_runner(
+            run_dir,
+            records,
+            ["m_a", "m_b"],
+            record_metric_filter={"rec-0": {"m_b"}},
+        )
+        # Simulate that m_b's class version was bumped to v0.2 between runs
+        _install_mock(
+            runner,
+            {
+                "rec-0": {
+                    "m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2"),
+                },
+            },
+        )
+
+        result = await runner.run_and_save_record("rec-0", record_dir)
+
+        # In-memory result preserves on-disk version for m_a, stamps new version on m_b
+        assert result.metrics["m_a"].version == "v0.1", "m_a version must not change on partial rerun"
+        assert result.metrics["m_b"].version == "v0.2", "m_b should be re-stamped with the new version"
+
+        # Persisted to disk identically
+        on_disk = json.loads((record_dir / "metrics.json").read_text())["metrics"]
+        assert on_disk["m_a"]["version"] == "v0.1"
+        assert on_disk["m_b"]["version"] == "v0.2"
+
+    @pytest.mark.asyncio
+    async def test_rerun_preserves_legacy_unversioned_metrics(self, tmp_path):
+        """Pre-versioning rows (no `version` on disk) stay `version=None` after a partial rerun."""
+        run_dir = _setup_run_dir(tmp_path, ["rec-0"])
+        records = [_make_record("rec-0")]
+        record_dir = run_dir / "records" / "rec-0"
+
+        # Write a metrics.json by hand without the version field (pre-versioning format)
+        legacy_blob = {
+            "record_id": "rec-0",
+            "metrics": {
+                "m_a": {"name": "m_a", "score": 0.5, "normalized_score": 0.5, "details": {}},
+                "m_b": {
+                    "name": "m_b",
+                    "score": 0.0,
+                    "normalized_score": 0.0,
+                    "error": "fail-b",
+                    "details": {},
+                },
+            },
+        }
+        (record_dir / "metrics.json").write_text(json.dumps(legacy_blob))
+
+        runner = _make_runner(
+            run_dir,
+            records,
+            ["m_a", "m_b"],
+            record_metric_filter={"rec-0": {"m_b"}},
+        )
+        _install_mock(
+            runner,
+            {"rec-0": {"m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2")}},
+        )
+
+        result = await runner.run_and_save_record("rec-0", record_dir)
+
+        # m_a was not rerun, so its legacy unversioned state must survive
+        assert result.metrics["m_a"].version is None
+        # m_b was rerun with the bumped class version
+        assert result.metrics["m_b"].version == "v0.2"
+
     @pytest.mark.asyncio
     async def test_same_metric_fails_on_two_records(self, tmp_path):
         """The same metric failing on two different records is rerun independently on each."""