diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 96b94736..2bf5ef82 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,9 +15,22 @@ repos:
       - id: end-of-file-fixer
   - repo: local
     hooks:
+      - id: regen-metric-signatures
+        name: Regenerate metric signatures
+        entry: uv run python scripts/regen_metric_signatures.py
+        language: system
+        pass_filenames: false
+        files: |-
+          (?x:
+            ^configs/prompts/judge\.yaml$
+            |
+            ^scripts/regen_metric_signatures\.py$
+            |
+            ^src/eva/metrics/
+          )
       - id: check-version-bump
         name: Check simulation/metrics version bump
-        entry: python3 scripts/check_version_bump.py
+        entry: uv run python scripts/check_version_bump.py
         language: system
         pass_filenames: false
         always_run: true
diff --git a/scripts/regen_metric_signatures.py b/scripts/regen_metric_signatures.py
new file mode 100644
index 00000000..9c27e69e
--- /dev/null
+++ b/scripts/regen_metric_signatures.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Regenerate tests/fixtures/metric_signatures.json.
+
+Run this after intentionally changing a metric's logic and bumping its
+`version` class attribute (or after editing its judge prompt template).
+The drift test (tests/unit/metrics/test_metric_signatures.py) compares
+the current state against this fixture and fails on any unintended drift.
+
+Usage:
+    python scripts/regen_metric_signatures.py
+"""
+
+import json
+from pathlib import Path
+
+from eva.metrics.signatures import compute_all_metric_signatures
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+FIXTURE_PATH = REPO_ROOT / "tests" / "fixtures" / "metric_signatures.json"
+
+
+def main() -> None:
+    signatures = compute_all_metric_signatures()
+    FIXTURE_PATH.parent.mkdir(parents=True, exist_ok=True)
+    FIXTURE_PATH.write_text(json.dumps(signatures, indent=2, sort_keys=True) + "\n")
+    print(f"Wrote {len(signatures)} metric signatures to {FIXTURE_PATH.relative_to(REPO_ROOT)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity.py b/src/eva/metrics/accuracy/agent_speech_fidelity.py
index a659f7c4..d84179f7 100644
--- a/src/eva/metrics/accuracy/agent_speech_fidelity.py
+++ b/src/eva/metrics/accuracy/agent_speech_fidelity.py
@@ -14,6 +14,7 @@ class AgentSpeechFidelityMetric(SpeechFidelityBaseMetric):
     """
 
     name = "agent_speech_fidelity"
+    version = "v0.1"
     description = "Audio-based evaluation of agent speech fidelity to the intended text"
     category = "accuracy"
     role = "assistant"
diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
index c6d43bb6..2a9706cf 100644
--- a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
+++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
@@ -25,6 +25,7 @@ class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric):
     """
 
     name = "agent_speech_fidelity"
+    version = "v0.1"
    description = "Audio-based evaluation of agent entity fidelity for S2S models"
     category = "accuracy"
     role = "assistant"
diff --git a/src/eva/metrics/accuracy/faithfulness.py b/src/eva/metrics/accuracy/faithfulness.py
index 3a85d29e..e6743792 100644
--- a/src/eva/metrics/accuracy/faithfulness.py
+++ b/src/eva/metrics/accuracy/faithfulness.py
@@ -54,6 +54,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric):
     """
 
     name = "faithfulness"
+    version = "v0.1"
     description = (
         "LLM judge evaluation of whether the assistant remains faithful to information, policies, and instructions"
     )
diff --git a/src/eva/metrics/accuracy/task_completion.py b/src/eva/metrics/accuracy/task_completion.py
index f1cbe8d0..2ca780b8 100644
--- a/src/eva/metrics/accuracy/task_completion.py
+++ b/src/eva/metrics/accuracy/task_completion.py
@@ -35,6 +35,7 @@ class TaskCompletion(BaseMetric):
     """
 
     name = "task_completion"
+    version = "v0.1"
     description = "Binary task completion via scenario DB state hash comparison"
     category = "accuracy"
     metric_type = MetricType.CODE
diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py
index 45a4a501..7935f589 100644
--- a/src/eva/metrics/base.py
+++ b/src/eva/metrics/base.py
@@ -22,6 +22,7 @@
     resolve_turn_id,
     validate_rating,
 )
+from eva.metrics.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
 from eva.models.config import PipelineType
 from eva.models.results import MetricScore
 from eva.utils.llm_client import LLMClient
@@ -163,6 +164,9 @@ class BaseMetric(ABC):
     pass_at_k_threshold: float = 0.5  # Normalized score threshold for pass@k pass/fail
     exclude_from_pass_at_k: bool = False  # Set True for metrics not suitable for pass@k
     supported_pipeline_types: frozenset[PipelineType] = frozenset(PipelineType)  # Pipeline types this metric supports
+    # Bump on intentional logic changes; MetricsRunner stamps this onto every MetricScore
+    # produced by compute(). Required on all concrete subclasses; the drift test enforces this.
+    version: str | None = None
     # Direction of the displayed value (normalized_score if present, else score).
     # Override to False for lower-is-better parent metrics (e.g. latency). Sub-metric
     # direction is derived from the key suffix (see eva.metrics.utils.direction_for_sub_metric).
@@ -179,8 +183,13 @@ def __init__(self, config: dict[str, Any] | None = None):
         self.prompt_manager = get_prompt_manager()
 
     def get_judge_prompt(self, prompt_key: str = "user_prompt", **variables) -> str:
-        """Get judge prompt using PromptManager."""
+        """Get judge prompt using PromptManager.
+
+        Stamps the unrendered template's sha256[:12] into the prompt-hash contextvar so
+        any MetricScore built afterwards in the same compute() picks it up automatically.
+        """
         prompt_path = f"judge.{self.name}.{prompt_key}"
+        _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path)))
         return self.prompt_manager.get_prompt(prompt_path, **variables)
 
     @abstractmethod
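Note: the contextvar handoff in `get_judge_prompt` above is easiest to see in isolation. Below is a minimal, runnable sketch of the same pattern; `_PROMPT_HASH`, `hash_template`, and the free-standing `get_judge_prompt` are hypothetical stand-ins for this sketch, not the repo's actual names.

```python
# Standalone sketch of the stamping pattern: hash the *unrendered* template
# into a contextvar before rendering, so later consumers can read it.
import hashlib
from contextvars import ContextVar

_PROMPT_HASH: ContextVar[str | None] = ContextVar("prompt_hash", default=None)


def hash_template(template: str) -> str:
    # Hashing the unrendered template means per-record variables never change the hash.
    return hashlib.sha256(template.encode()).hexdigest()[:12]


def get_judge_prompt(template: str, **variables) -> str:
    # Stamp first, then render; anything built afterwards in this context sees the hash.
    _PROMPT_HASH.set(hash_template(template))
    return template.format(**variables)


rendered = get_judge_prompt("Rate this reply: {reply}", reply="hello")
assert rendered == "Rate this reply: hello"
assert _PROMPT_HASH.get() == hash_template("Rate this reply: {reply}")
```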
+ """ prompt_path = f"judge.{self.name}.{prompt_key}" + _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path))) return self.prompt_manager.get_prompt(prompt_path, **variables) @abstractmethod diff --git a/src/eva/metrics/diagnostic/authentication_success.py b/src/eva/metrics/diagnostic/authentication_success.py index 80f93905..0662231b 100644 --- a/src/eva/metrics/diagnostic/authentication_success.py +++ b/src/eva/metrics/diagnostic/authentication_success.py @@ -45,6 +45,7 @@ class AuthenticationSuccessMetric(CodeMetric): """ name = "authentication_success" + version = "v0.1" description = "Checks if session state in final DB is a superset of expected session" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/conversation_correctly_finished.py b/src/eva/metrics/diagnostic/conversation_correctly_finished.py index 6ce5cc49..c38adb7d 100644 --- a/src/eva/metrics/diagnostic/conversation_correctly_finished.py +++ b/src/eva/metrics/diagnostic/conversation_correctly_finished.py @@ -11,6 +11,7 @@ class ConversationCorrectlyFinishedMetric(CodeMetric): """0.0 when the agent timed out on the user's final turn; 1.0 otherwise.""" name = "conversation_correctly_finished" + version = "v0.1" description = "Diagnostic metric: 0.0 when agent failed to respond to the user's final turn" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index a7724b07..56571e2c 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -60,6 +60,7 @@ class ResponseSpeedMetric(CodeMetric): description = "Diagnostic metric: latency between user utterance end and assistant response start" exclude_from_pass_at_k = True higher_is_better = False # Score is latency in seconds — lower is better. 
+ version = "v0.1" async def compute(self, context: MetricContext) -> MetricScore: try: diff --git a/src/eva/metrics/diagnostic/speakability.py b/src/eva/metrics/diagnostic/speakability.py index bc9d8a45..f3e9c3b4 100644 --- a/src/eva/metrics/diagnostic/speakability.py +++ b/src/eva/metrics/diagnostic/speakability.py @@ -30,6 +30,7 @@ class SpeakabilityJudgeMetric(PerTurnConversationJudgeMetric): """ name = "speakability" + version = "v0.1" description = "Debug metric: LLM judge evaluation of text voice-friendliness per turn" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/stt_wer.py b/src/eva/metrics/diagnostic/stt_wer.py index 52ad5c2b..5c52e50c 100644 --- a/src/eva/metrics/diagnostic/stt_wer.py +++ b/src/eva/metrics/diagnostic/stt_wer.py @@ -72,6 +72,7 @@ class STTWERMetric(CodeMetric): """ name = "stt_wer" + version = "v0.1" description = "Debug metric: Speech-to-Text transcription accuracy using Word Error Rate" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/tool_call_validity.py b/src/eva/metrics/diagnostic/tool_call_validity.py index 12b4f7d8..01cfde95 100644 --- a/src/eva/metrics/diagnostic/tool_call_validity.py +++ b/src/eva/metrics/diagnostic/tool_call_validity.py @@ -39,6 +39,7 @@ class ToolCallValidity(CodeMetric): """ name = "tool_call_validity" + version = "v0.1" description = "Debug metric: fraction of tool calls with correctly formatted parameters" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py index ecdb79bf..4990b579 100644 --- a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py +++ b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py @@ -48,6 +48,7 @@ class TranscriptionAccuracyKeyEntitiesMetric(TextJudgeMetric): """ name = "transcription_accuracy_key_entities" + version = "v0.1" description = "Debug metric: LLM judge evaluation of STT key entity transcription accuracy for entire conversation" category = "diagnostic" exclude_from_pass_at_k = True diff --git a/src/eva/metrics/experience/conciseness.py b/src/eva/metrics/experience/conciseness.py index 3f77890e..2b4bc196 100644 --- a/src/eva/metrics/experience/conciseness.py +++ b/src/eva/metrics/experience/conciseness.py @@ -28,6 +28,7 @@ class ConcisenessJudgeMetric(PerTurnConversationJudgeMetric): """ name = "conciseness" + version = "v0.1" description = "LLM judge evaluation of assistant response conciseness" category = "experience" rating_scale = (1, 3) diff --git a/src/eva/metrics/experience/conversation_progression.py b/src/eva/metrics/experience/conversation_progression.py index 595101e4..f5abfa80 100644 --- a/src/eva/metrics/experience/conversation_progression.py +++ b/src/eva/metrics/experience/conversation_progression.py @@ -32,6 +32,7 @@ class ConversationProgressionJudgeMetric(ConversationTextJudgeMetric): """ name = "conversation_progression" + version = "v0.1" description = "LLM judge evaluation of whether the assistant moved the conversation forward productively" category = "experience" rating_scale = (1, 3) diff --git a/src/eva/metrics/experience/turn_taking.py b/src/eva/metrics/experience/turn_taking.py index 08034584..7e3da258 100644 --- a/src/eva/metrics/experience/turn_taking.py +++ b/src/eva/metrics/experience/turn_taking.py @@ -58,6 +58,7 @@ class TurnTakingMetric(CodeMetric): description = "Turn-taking evaluation based on 
per-turn latency and interruption behavior" category = "experience" pass_at_k_threshold = 0.8 + version = "v0.1" # --- Latency curve (piecewise linear). 0 outside [LATENCY_HARD_EARLY_MS, LATENCY_HARD_LATE_MS]. --- # Ramp up 0 → 1 from LATENCY_HARD_EARLY_MS to LATENCY_SWEET_SPOT_LOW_MS. diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index f7d945c5..2744db52 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -16,6 +16,7 @@ from eva.metrics.processor import MetricsContextProcessor from eva.metrics.registry import MetricRegistry, get_global_registry from eva.metrics.utils import direction_for_sub_metric +from eva.metrics.versioning import _CURRENT_METRIC_VERSION from eva.models.config import PipelineType, get_pipeline_type from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics @@ -448,6 +449,9 @@ async def _run_record(self, record_id: str, record_dir: Path) -> RecordMetrics: # Create tasks for all metrics async def compute_metric(metric: BaseMetric) -> tuple[str, MetricScore]: """Compute a single metric and handle errors.""" + # Each gather() task gets its own contextvar snapshot, so this set is + # isolated from sibling/parent tasks — no reset needed. + _CURRENT_METRIC_VERSION.set(metric.version) try: logger.info(f"[{record_id}] Starting metric: {metric.name}") score = await metric.compute(context) diff --git a/src/eva/metrics/signatures.py b/src/eva/metrics/signatures.py new file mode 100644 index 00000000..f7ca1f36 --- /dev/null +++ b/src/eva/metrics/signatures.py @@ -0,0 +1,78 @@ +"""Compute drift signatures for metric classes. + +A metric's "signature" captures everything we want to detect changes to: + - `version`: the manually-bumped string on the class + - `source_hash`: sha256[:12] of `inspect.getsource(cls)` (class body) + - `prompt_hash`: sha256[:12] of the unrendered judge prompt template, or + None for non-judge metrics + +The drift test compares the current signatures against a checked-in fixture +and fails if anything changed without an explicit version bump + fixture regen. +""" + +import hashlib +import inspect + +# Importing the metric subpackages forces all concrete metric classes to be +# registered as BaseMetric subclasses, so walking __subclasses__ finds them. +import eva.metrics.accuracy # noqa: F401 +import eva.metrics.diagnostic # noqa: F401 +import eva.metrics.experience # noqa: F401 +import eva.metrics.validation # noqa: F401 +from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric +from eva.metrics.versioning import hash_prompt_template +from eva.utils.prompt_manager import get_prompt_manager + + +def _all_concrete_versioned_metric_classes() -> dict[str, type[BaseMetric]]: + """Walk BaseMetric subclasses; return concrete classes that set a version. + + Keyed on class qualname so co-named classes (e.g., the cascade vs S2S + variants of `agent_speech_fidelity`) get distinct entries. + """ + result: dict[str, type[BaseMetric]] = {} + + def walk(cls: type) -> None: + for sub in cls.__subclasses__(): + walk(sub) + if inspect.isabstract(sub): + continue + # `version` is None on BaseMetric; only concrete classes that + # deliberately set it are participating. 
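Note: a toy version of what `signatures.py` does may help: recursively walk `__subclasses__`, keep only classes that opt in by setting `version`, and hash each class body with `inspect.getsource`. The `Base`/`Abstract`/`Concrete` classes below are invented for the sketch; the real walk starts at `BaseMetric`.

```python
# Toy demonstration of the subclass walk and source hashing (run as a script;
# inspect.getsource needs classes defined in a real file).
import hashlib
import inspect


class Base:
    version: str | None = None


class Abstract(Base):  # sets no version, so it is skipped like an abstract base
    pass


class Concrete(Abstract):
    version = "v0.1"


def walk(cls: type):
    # Depth-first over the whole subclass tree, yielding every class once.
    for sub in cls.__subclasses__():
        yield from walk(sub)
        yield sub


versioned = {c.__qualname__: c for c in walk(Base) if getattr(c, "version", None)}
for qualname, cls in versioned.items():
    source_hash = hashlib.sha256(inspect.getsource(cls).encode()).hexdigest()[:12]
    print(qualname, cls.version, source_hash)  # prints only Concrete
```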
+ if getattr(sub, "version", None) is None: + continue + result[sub.__qualname__] = sub + + walk(BaseMetric) + return result + + +def _source_hash(cls: type) -> str: + """sha256[:12] of the class body source.""" + return hashlib.sha256(inspect.getsource(cls).encode("utf-8")).hexdigest()[:12] + + +def _prompt_hash_for_metric(cls: type[BaseMetric]) -> str | None: + """Return the prompt template hash for judge metrics, or None. + + All judge metrics in this codebase use `judge.{name}.user_prompt`. + A judge metric without a corresponding template raises KeyError — + that's a configuration bug we want surfaced. + """ + if not issubclass(cls, TextJudgeMetric | AudioJudgeMetric): + return None + template = get_prompt_manager().get_template(f"judge.{cls.name}.user_prompt") + return hash_prompt_template(template) + + +def compute_all_metric_signatures() -> dict[str, dict[str, str | None]]: + """Return {class_qualname: {version, source_hash, prompt_hash}} for every concrete metric.""" + out: dict[str, dict[str, str | None]] = {} + for qualname, cls in _all_concrete_versioned_metric_classes().items(): + out[qualname] = { + "name": cls.name, + "version": cls.version, + "source_hash": _source_hash(cls), + "prompt_hash": _prompt_hash_for_metric(cls), + } + return out diff --git a/src/eva/metrics/validation/conversation_valid_end.py b/src/eva/metrics/validation/conversation_valid_end.py index 61831f6f..f6a6ebc5 100644 --- a/src/eva/metrics/validation/conversation_valid_end.py +++ b/src/eva/metrics/validation/conversation_valid_end.py @@ -14,6 +14,7 @@ class ConversationValidEndMetric(CodeMetric): """Binary score: 1.0 when the conversation ended on goodbye OR agent-timeout-on-user-turn; 0.0 otherwise.""" name = "conversation_valid_end" + version = "v0.1" description = "Validation metric: conversation reached a definitive end state" category = "validation" diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py index 56bd4da4..0af13816 100644 --- a/src/eva/metrics/validation/user_behavioral_fidelity.py +++ b/src/eva/metrics/validation/user_behavioral_fidelity.py @@ -65,6 +65,7 @@ class UserBehavioralFidelityMetric(ConversationTextJudgeMetric): """ name = "user_behavioral_fidelity" + version = "v0.1" description = "Validation metric for simulated user corruption detection" category = "validation" rating_scale = (0, 1) diff --git a/src/eva/metrics/validation/user_speech_fidelity.py b/src/eva/metrics/validation/user_speech_fidelity.py index 3f605d32..a11fc8a6 100644 --- a/src/eva/metrics/validation/user_speech_fidelity.py +++ b/src/eva/metrics/validation/user_speech_fidelity.py @@ -14,6 +14,7 @@ class UserSpeechFidelityMetric(SpeechFidelityBaseMetric): """ name = "user_speech_fidelity" + version = "v0.1" description = "Audio-based validation of user speech fidelity to the intended text" category = "validation" role = "user" diff --git a/src/eva/metrics/versioning.py b/src/eva/metrics/versioning.py new file mode 100644 index 00000000..88908a94 --- /dev/null +++ b/src/eva/metrics/versioning.py @@ -0,0 +1,24 @@ +"""Per-record version + prompt-hash stamping for MetricScore. + +MetricsRunner sets these contextvars around every metric.compute() call. +The MetricScore Pydantic model has a model_validator that reads them and +auto-fills the version/prompt_hash fields when unset, so all scores and +sub-scores built inside that compute() inherit the right values without +each call site having to thread them through explicitly. 
diff --git a/src/eva/models/results.py b/src/eva/models/results.py
index 993184db..0caa7ab6 100644
--- a/src/eva/models/results.py
+++ b/src/eva/models/results.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
 
 class ErrorDetails(BaseModel):
@@ -94,10 +94,35 @@ class MetricScore(BaseModel):
         False,
         description="True when the metric had no applicable data to score (distinct from errored)",
     )
+    version: str | None = Field(
+        None,
+        description="Metric implementation version (set by the metric class) for tracking which "
+        "computation logic produced this score across partial reruns",
+    )
+    prompt_hash: str | None = Field(
+        None,
+        description="sha256[:12] of the unrendered judge prompt template; None for non-judge metrics. "
+        "Lets us detect prompt edits without relying on the metric author to bump `version`.",
+    )
     sub_metrics: dict[str, "MetricScore"] | None = Field(
         None, description="Optional sub-metric breakdowns, aggregated generically by the runner"
     )
 
+    @model_validator(mode="after")
+    def _auto_stamp_version_and_hash(self) -> "MetricScore":
+        # Only fill if unset, so deserialization from disk preserves historical values
+        # and explicit kwargs (e.g., tests) always win.
+        if self.version is None or self.prompt_hash is None:
+            # Lazy import to avoid circular dependency:
+            # eva.models.results -> eva.metrics -> ... -> eva.metrics.utils -> eva.models.results
+            from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH
+
+            if self.version is None:
+                self.version = _CURRENT_METRIC_VERSION.get()
+            if self.prompt_hash is None:
+                self.prompt_hash = _CURRENT_PROMPT_HASH.get()
+        return self
+
 
 class PassAtKResult(BaseModel):
     """pass@k and pass^k result for a single metric across multiple trials."""
diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py
index c67db941..1060c41e 100644
--- a/src/eva/utils/prompt_manager.py
+++ b/src/eva/utils/prompt_manager.py
@@ -81,19 +81,14 @@ def _load_from_directory(self, directory: Path) -> None:
         for yaml_file in yaml_files:
             self._load_single_file(yaml_file)
 
-    def get_prompt(self, path: str, **variables) -> str:
-        """Get a prompt by its path and substitute variables.
+    def get_template(self, path: str) -> str:
+        """Return the unrendered prompt template at `path` (no variable substitution).
 
         Args:
             path: Dot-separated path to the prompt (e.g., "orchestrator.system_prompt")
-            **variables: Variable values to substitute in the prompt
 
-        Returns:
-            The prompt with variables substituted
-
-        Raises:
-            KeyError: If the prompt path is not found
-            ValueError: If the prompt is not a string
+        Used as-is for hashing prompt templates so we can detect prompt edits across
+        runs without per-record variable substitutions changing the hash.
         """
         # Navigate to the prompt using the dot-separated path
         parts = path.split(".")
@@ -109,6 +104,24 @@ def get_prompt(self, path: str, **variables) -> str:
         if not isinstance(value, str):
             raise ValueError(f"Prompt at {path} is not a string: {type(value)}")
 
+        return value
+
+    def get_prompt(self, path: str, **variables) -> str:
+        """Get a prompt by its path and substitute variables.
+
+        Args:
+            path: Dot-separated path to the prompt (e.g., "orchestrator.system_prompt")
+            **variables: Variable values to substitute in the prompt
+
+        Returns:
+            The prompt with variables substituted
+
+        Raises:
+            KeyError: If the prompt path is not found
+            ValueError: If the prompt is not a string
+        """
+        value = self.get_template(path)
+
         # Substitute variables using str.format()
         # Auto-inject global variables from the _shared section (prompt-level vars take precedence)
         shared = self.prompts.get("_shared", {})
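Note: a reduced sketch of the auto-stamping validator on `MetricScore`, assuming Pydantic v2 semantics where a `model_validator(mode="after")` method receives and returns the instance. `Score` and `_VERSION` are cut-down stand-ins with most fields omitted:

```python
# Minimal model showing the fill-only-when-unset behavior of the validator above.
from contextvars import ContextVar

from pydantic import BaseModel, model_validator

_VERSION: ContextVar[str | None] = ContextVar("version", default=None)


class Score(BaseModel):
    name: str
    score: float
    version: str | None = None

    @model_validator(mode="after")
    def _stamp(self) -> "Score":
        if self.version is None:  # only fill when unset; explicit values win
            self.version = _VERSION.get()
        return self


_VERSION.set("v0.2")
assert Score(name="m", score=1.0).version == "v0.2"  # auto-stamped from the contextvar
assert Score(name="m", score=1.0, version="v0.1").version == "v0.1"  # explicit kwarg wins
_VERSION.set(None)
assert Score(name="m", score=1.0).version is None  # deserialization-like state: untouched
```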
diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json
new file mode 100644
index 00000000..fc79f392
--- /dev/null
+++ b/tests/fixtures/metric_signatures.json
@@ -0,0 +1,104 @@
+{
+  "AgentSpeechFidelityMetric": {
+    "name": "agent_speech_fidelity",
+    "prompt_hash": "864be78919d2",
+    "source_hash": "77743114e9b0",
+    "version": "v0.1"
+  },
+  "AgentSpeechFidelityS2SMetric": {
+    "name": "agent_speech_fidelity",
+    "prompt_hash": "864be78919d2",
+    "source_hash": "5b3deb4968cd",
+    "version": "v0.1"
+  },
+  "AuthenticationSuccessMetric": {
+    "name": "authentication_success",
+    "prompt_hash": null,
+    "source_hash": "cdc7c59d2684",
+    "version": "v0.1"
+  },
+  "ConcisenessJudgeMetric": {
+    "name": "conciseness",
+    "prompt_hash": "5d033338d36a",
+    "source_hash": "cd0ea09a9613",
+    "version": "v0.1"
+  },
+  "ConversationCorrectlyFinishedMetric": {
+    "name": "conversation_correctly_finished",
+    "prompt_hash": null,
+    "source_hash": "4f27cacab7d2",
+    "version": "v0.1"
+  },
+  "ConversationProgressionJudgeMetric": {
+    "name": "conversation_progression",
+    "prompt_hash": "f3240185faf6",
+    "source_hash": "91b71c803d77",
+    "version": "v0.1"
+  },
+  "ConversationValidEndMetric": {
+    "name": "conversation_valid_end",
+    "prompt_hash": null,
+    "source_hash": "02d1c3fb330b",
+    "version": "v0.1"
+  },
+  "FaithfulnessJudgeMetric": {
+    "name": "faithfulness",
+    "prompt_hash": "1add7d47362c",
+    "source_hash": "8e3fd6bc9960",
+    "version": "v0.1"
+  },
+  "ResponseSpeedMetric": {
+    "name": "response_speed",
+    "prompt_hash": null,
+    "source_hash": "ebce1a06bf30",
+    "version": "v0.1"
+  },
+  "STTWERMetric": {
+    "name": "stt_wer",
+    "prompt_hash": null,
+    "source_hash": "01fcfbc1cf21",
+    "version": "v0.1"
+  },
+  "SpeakabilityJudgeMetric": {
+    "name": "speakability",
+    "prompt_hash": "cd2cc44fc96c",
+    "source_hash": "187ddf9dc0da",
+    "version": "v0.1"
+  },
+  "TaskCompletion": {
+    "name": "task_completion",
+    "prompt_hash": null,
+    "source_hash": "01aed1a552f4",
+    "version": "v0.1"
+  },
+  "ToolCallValidity": {
+    "name": "tool_call_validity",
+    "prompt_hash": null,
+    "source_hash": "1572b16592fe",
+    "version": "v0.1"
+  },
+  "TranscriptionAccuracyKeyEntitiesMetric": {
+    "name": "transcription_accuracy_key_entities",
+    "prompt_hash": "c0980ff2168d",
+    "source_hash": "a83a699d0fda",
+    "version": "v0.1"
+  },
+  "TurnTakingMetric": {
+    "name": "turn_taking",
+    "prompt_hash": null,
+    "source_hash": "aa574674ff83",
+    "version": "v0.1"
+  },
+  "UserBehavioralFidelityMetric": {
+    "name": "user_behavioral_fidelity",
+    "prompt_hash": "06477144c28e",
+    "source_hash": "af8144bd7731",
+    "version": "v0.1"
+  },
+  "UserSpeechFidelityMetric": {
+    "name": "user_speech_fidelity",
+    "prompt_hash": "c4d97e36b865",
+    "source_hash": "e38e8c162b3d",
+    "version": "v0.1"
+  }
+}
diff --git a/tests/unit/metrics/test_metric_signatures.py b/tests/unit/metrics/test_metric_signatures.py
new file mode 100644
index 00000000..24ce61c5
--- /dev/null
+++ b/tests/unit/metrics/test_metric_signatures.py
@@ -0,0 +1,83 @@
+"""Drift test: fail when a metric's source or prompt changes without a version bump.
+
+Each concrete metric class has three signature fields:
+  - version: manually bumped string on the class
+  - source_hash: sha256[:12] of inspect.getsource(cls)
+  - prompt_hash: sha256[:12] of judge.{name}.user_prompt template (None for code metrics)
+
+The fixture at tests/fixtures/metric_signatures.json is the source of truth for
+the *currently-released* state. The test compares the current signatures to it
+and reports drift. Authors update the fixture by running
+`python scripts/regen_metric_signatures.py` after bumping `version`.
+
+Failure modes the test catches:
+  - source_hash changed, version unchanged → "bump version then regen fixture"
+  - prompt_hash changed, version unchanged → "bump version then regen fixture"
+  - version bumped but fixture not regenerated → "regen fixture"
+  - new metric class with no fixture entry → "add to fixture via regen"
+  - metric removed from fixture → "delete from fixture via regen"
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from eva.metrics.signatures import compute_all_metric_signatures
+
+FIXTURE_PATH = Path(__file__).resolve().parents[3] / "tests" / "fixtures" / "metric_signatures.json"
+REGEN_HINT = "Run `python scripts/regen_metric_signatures.py` after bumping `version` on affected classes."
+
+
+@pytest.fixture(scope="module")
+def fixture_signatures() -> dict[str, dict[str, str | None]]:
+    return json.loads(FIXTURE_PATH.read_text())
+
+
+@pytest.fixture(scope="module")
+def current_signatures() -> dict[str, dict[str, str | None]]:
+    return compute_all_metric_signatures()
+
+
+def test_no_unannounced_metric_drift(
+    fixture_signatures: dict[str, dict[str, str | None]],
+    current_signatures: dict[str, dict[str, str | None]],
+) -> None:
+    """Fail if any metric's source/prompt changed without its version being bumped."""
+    failures: list[str] = []
+
+    for qualname, current in current_signatures.items():
+        recorded = fixture_signatures.get(qualname)
+        if recorded is None:
+            failures.append(f"{qualname}: new metric class not in fixture. {REGEN_HINT}")
+            continue
+
+        version_changed = current["version"] != recorded["version"]
+        source_changed = current["source_hash"] != recorded["source_hash"]
+        prompt_changed = current["prompt_hash"] != recorded["prompt_hash"]
+
+        if not (source_changed or prompt_changed or version_changed):
+            continue  # fully in sync
+
+        if version_changed:
+            # Author bumped version; they still need to regen the fixture so
+            # future drift is detected against the new baseline.
+            failures.append(f"{qualname}: version bumped ({recorded['version']} → {current['version']}). {REGEN_HINT}")
+            continue
+
+        # Code or prompt changed but version is unchanged — the case the test
+        # exists to catch.
+        what = []
+        if source_changed:
+            what.append(f"source ({recorded['source_hash']} → {current['source_hash']})")
+        if prompt_changed:
+            what.append(f"prompt ({recorded['prompt_hash']} → {current['prompt_hash']})")
+        failures.append(
+            f"{qualname}: {' and '.join(what)} changed but version still {current['version']!r}. "
+            f"Bump `version` on the class, then run regen."
+        )
+
+    for qualname in fixture_signatures.keys() - current_signatures.keys():
+        failures.append(f"{qualname}: removed from code but still in fixture. {REGEN_HINT}")
+
+    assert not failures, "Metric signature drift detected:\n  " + "\n  ".join(failures)
diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py
index d1d587ad..fec32d0d 100644
--- a/tests/unit/metrics/test_runner.py
+++ b/tests/unit/metrics/test_runner.py
@@ -615,6 +615,96 @@ async def test_rerun_partial_success(self, tmp_path):
         assert result.metrics["m_a"].error is None
         assert result.metrics["m_b"].error == "still failing"
 
+    @pytest.mark.asyncio
+    async def test_rerun_preserves_version_of_non_rerun_metrics(self, tmp_path):
+        """Non-rerun metrics keep their on-disk version; only the rerun metric gets the new version.
+
+        Models the workflow where a user bumps `m_b`'s class version (e.g., to v0.2) and
+        then reruns only the errored records for m_b. m_a wasn't touched, so its on-disk
+        version must survive untouched.
+        """
+        run_dir = _setup_run_dir(tmp_path, ["rec-0"])
+        records = [_make_record("rec-0")]
+        record_dir = run_dir / "records" / "rec-0"
+
+        # On disk: m_a succeeded at v0.1, m_b failed at v0.1 (and we want to rerun m_b)
+        _write_metrics_json(
+            record_dir,
+            "rec-0",
+            {
+                "m_a": MetricScore(name="m_a", score=0.9, normalized_score=0.9, version="v0.1"),
+                "m_b": MetricScore(name="m_b", score=0.0, normalized_score=0.0, error="fail-b", version="v0.1"),
+            },
+        )
+
+        runner = _make_runner(
+            run_dir,
+            records,
+            ["m_a", "m_b"],
+            record_metric_filter={"rec-0": {"m_b"}},
+        )
+        # Simulate that m_b's class version was bumped to v0.2 between runs
+        _install_mock(
+            runner,
+            {
+                "rec-0": {
+                    "m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2"),
+                },
+            },
+        )
+
+        result = await runner.run_and_save_record("rec-0", record_dir)
+
+        # In-memory result preserves on-disk version for m_a, stamps new version on m_b
+        assert result.metrics["m_a"].version == "v0.1", "m_a version must not change on partial rerun"
+        assert result.metrics["m_b"].version == "v0.2", "m_b should be re-stamped with the new version"
+
+        # Persisted to disk identically
+        on_disk = json.loads((record_dir / "metrics.json").read_text())["metrics"]
+        assert on_disk["m_a"]["version"] == "v0.1"
+        assert on_disk["m_b"]["version"] == "v0.2"
+
+    @pytest.mark.asyncio
+    async def test_rerun_preserves_legacy_unversioned_metrics(self, tmp_path):
+        """Pre-versioning rows (no `version` on disk) stay `version=None` after a partial rerun."""
+        run_dir = _setup_run_dir(tmp_path, ["rec-0"])
+        records = [_make_record("rec-0")]
+        record_dir = run_dir / "records" / "rec-0"
+
+        # Write a metrics.json by hand without the version field (pre-versioning format)
+        legacy_blob = {
+            "record_id": "rec-0",
+            "metrics": {
+                "m_a": {"name": "m_a", "score": 0.5, "normalized_score": 0.5, "details": {}},
+                "m_b": {
+                    "name": "m_b",
+                    "score": 0.0,
+                    "normalized_score": 0.0,
+                    "error": "fail-b",
+                    "details": {},
+                },
+            },
+        }
+        (record_dir / "metrics.json").write_text(json.dumps(legacy_blob))
+
+        runner = _make_runner(
+            run_dir,
+            records,
+            ["m_a", "m_b"],
+            record_metric_filter={"rec-0": {"m_b"}},
+        )
+        _install_mock(
+            runner,
+            {"rec-0": {"m_b": MetricScore(name="m_b", score=0.7, normalized_score=0.7, version="v0.2")}},
+        )
+
+        result = await runner.run_and_save_record("rec-0", record_dir)
+
+        # m_a was not rerun, so its legacy unversioned state must survive
+        assert result.metrics["m_a"].version is None
+        # m_b was rerun with the bumped class version
+        assert result.metrics["m_b"].version == "v0.2"
+
     @pytest.mark.asyncio
     async def test_same_metric_fails_on_two_records(self, tmp_path):
         """The same metric failing on two different records is rerun independently on each."""