15 changes: 14 additions & 1 deletion .pre-commit-config.yaml
@@ -15,9 +15,22 @@ repos:
      - id: end-of-file-fixer
  - repo: local
    hooks:
      - id: regen-metric-signatures
        name: Regenerate metric signatures
        entry: uv run python scripts/regen_metric_signatures.py
        language: system
        pass_filenames: false
        files: |-
          (?x:
            ^configs/prompts/judge\.yaml$
            |
            ^scripts/regen_metric_signatures\.py$
            |
            ^src/eva/metrics/
          )
      - id: check-version-bump
        name: Check simulation/metrics version bump
-       entry: python3 scripts/check_version_bump.py
+       entry: uv run python scripts/check_version_bump.py
        language: system
        pass_filenames: false
        always_run: true
30 changes: 30 additions & 0 deletions scripts/regen_metric_signatures.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""Regenerate tests/fixtures/metric_signatures.json.

Run this after intentionally changing a metric's logic and bumping its
`version` class attribute (or after editing its judge prompt template).
The drift test (tests/unit/metrics/test_metric_signatures.py) compares
the current state against this fixture and fails on any unintended drift.

Usage:
    python scripts/regen_metric_signatures.py
"""

import json
from pathlib import Path

from eva.metrics.signatures import compute_all_metric_signatures

REPO_ROOT = Path(__file__).resolve().parent.parent
FIXTURE_PATH = REPO_ROOT / "tests" / "fixtures" / "metric_signatures.json"


def main() -> None:
    signatures = compute_all_metric_signatures()
    FIXTURE_PATH.parent.mkdir(parents=True, exist_ok=True)
    FIXTURE_PATH.write_text(json.dumps(signatures, indent=2, sort_keys=True) + "\n")
    print(f"Wrote {len(signatures)} metric signatures to {FIXTURE_PATH.relative_to(REPO_ROOT)}")


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/agent_speech_fidelity.py
@@ -14,6 +14,7 @@ class AgentSpeechFidelityMetric(SpeechFidelityBaseMetric):
"""

name = "agent_speech_fidelity"
version = "v0.1"
description = "Audio-based evaluation of agent speech fidelity to the intended text"
category = "accuracy"
role = "assistant"
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
@@ -25,6 +25,7 @@ class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric):
"""

name = "agent_speech_fidelity"
version = "v0.1"
description = "Audio-based evaluation of agent entity fidelity for S2S models"
category = "accuracy"
role = "assistant"
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/faithfulness.py
@@ -54,6 +54,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric):
"""

name = "faithfulness"
version = "v0.1"
description = (
"LLM judge evaluation of whether the assistant remains faithful to information, policies, and instructions"
)
1 change: 1 addition & 0 deletions src/eva/metrics/accuracy/task_completion.py
@@ -35,6 +35,7 @@ class TaskCompletion(BaseMetric):
"""

name = "task_completion"
version = "v0.1"
description = "Binary task completion via scenario DB state hash comparison"
category = "accuracy"
metric_type = MetricType.CODE
11 changes: 10 additions & 1 deletion src/eva/metrics/base.py
@@ -22,6 +22,7 @@
    resolve_turn_id,
    validate_rating,
)
from eva.metrics.versioning import _CURRENT_PROMPT_HASH, hash_prompt_template
from eva.models.config import PipelineType
from eva.models.results import MetricScore
from eva.utils.llm_client import LLMClient
@@ -163,6 +164,9 @@ class BaseMetric(ABC):
    pass_at_k_threshold: float = 0.5  # Normalized score threshold for pass@k pass/fail
    exclude_from_pass_at_k: bool = False  # Set True for metrics not suitable for pass@k
    supported_pipeline_types: frozenset[PipelineType] = frozenset(PipelineType)  # Pipeline types this metric supports
    # Bump on intentional logic changes; MetricsRunner stamps this onto every MetricScore
    # produced by compute(). Required on all concrete subclasses — drift test enforces.
    version: str | None = None
    # Direction of the displayed value (normalized_score if present, else score).
    # Override to False for lower-is-better parent metrics (e.g. latency). Sub-metric
    # direction is derived from the key suffix (see eva.metrics.utils.direction_for_sub_metric).
@@ -179,8 +183,13 @@ def __init__(self, config: dict[str, Any] | None = None):
        self.prompt_manager = get_prompt_manager()

    def get_judge_prompt(self, prompt_key: str = "user_prompt", **variables) -> str:
-       """Get judge prompt using PromptManager."""
+       """Get judge prompt using PromptManager.
+
+       Stamps the unrendered template's sha256[:12] into the prompt-hash contextvar so
+       any MetricScore built afterwards in the same compute() picks it up automatically.
+       """
        prompt_path = f"judge.{self.name}.{prompt_key}"
        _CURRENT_PROMPT_HASH.set(hash_prompt_template(self.prompt_manager.get_template(prompt_path)))
        return self.prompt_manager.get_prompt(prompt_path, **variables)

    @abstractmethod
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/authentication_success.py
@@ -45,6 +45,7 @@ class AuthenticationSuccessMetric(CodeMetric):
"""

name = "authentication_success"
version = "v0.1"
description = "Checks if session state in final DB is a superset of expected session"
category = "diagnostic"
exclude_from_pass_at_k = True
@@ -11,6 +11,7 @@ class ConversationCorrectlyFinishedMetric(CodeMetric):
"""0.0 when the agent timed out on the user's final turn; 1.0 otherwise."""

name = "conversation_correctly_finished"
version = "v0.1"
description = "Diagnostic metric: 0.0 when agent failed to respond to the user's final turn"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/response_speed.py
@@ -60,6 +60,7 @@ class ResponseSpeedMetric(CodeMetric):
    description = "Diagnostic metric: latency between user utterance end and assistant response start"
    exclude_from_pass_at_k = True
    higher_is_better = False  # Score is latency in seconds — lower is better.
    version = "v0.1"

    async def compute(self, context: MetricContext) -> MetricScore:
        try:
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/speakability.py
@@ -30,6 +30,7 @@ class SpeakabilityJudgeMetric(PerTurnConversationJudgeMetric):
"""

name = "speakability"
version = "v0.1"
description = "Debug metric: LLM judge evaluation of text voice-friendliness per turn"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/stt_wer.py
@@ -72,6 +72,7 @@ class STTWERMetric(CodeMetric):
"""

name = "stt_wer"
version = "v0.1"
description = "Debug metric: Speech-to-Text transcription accuracy using Word Error Rate"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/diagnostic/tool_call_validity.py
@@ -39,6 +39,7 @@ class ToolCallValidity(CodeMetric):
"""

name = "tool_call_validity"
version = "v0.1"
description = "Debug metric: fraction of tool calls with correctly formatted parameters"
category = "diagnostic"
exclude_from_pass_at_k = True
@@ -48,6 +48,7 @@ class TranscriptionAccuracyKeyEntitiesMetric(TextJudgeMetric):
"""

name = "transcription_accuracy_key_entities"
version = "v0.1"
description = "Debug metric: LLM judge evaluation of STT key entity transcription accuracy for entire conversation"
category = "diagnostic"
exclude_from_pass_at_k = True
1 change: 1 addition & 0 deletions src/eva/metrics/experience/conciseness.py
@@ -28,6 +28,7 @@ class ConcisenessJudgeMetric(PerTurnConversationJudgeMetric):
"""

name = "conciseness"
version = "v0.1"
description = "LLM judge evaluation of assistant response conciseness"
category = "experience"
rating_scale = (1, 3)
1 change: 1 addition & 0 deletions src/eva/metrics/experience/conversation_progression.py
@@ -32,6 +32,7 @@ class ConversationProgressionJudgeMetric(ConversationTextJudgeMetric):
"""

name = "conversation_progression"
version = "v0.1"
description = "LLM judge evaluation of whether the assistant moved the conversation forward productively"
category = "experience"
rating_scale = (1, 3)
1 change: 1 addition & 0 deletions src/eva/metrics/experience/turn_taking.py
@@ -58,6 +58,7 @@ class TurnTakingMetric(CodeMetric):
    description = "Turn-taking evaluation based on per-turn latency and interruption behavior"
    category = "experience"
    pass_at_k_threshold = 0.8
    version = "v0.1"

    # --- Latency curve (piecewise linear). 0 outside [LATENCY_HARD_EARLY_MS, LATENCY_HARD_LATE_MS]. ---
    # Ramp up 0 → 1 from LATENCY_HARD_EARLY_MS to LATENCY_SWEET_SPOT_LOW_MS.
4 changes: 4 additions & 0 deletions src/eva/metrics/runner.py
@@ -16,6 +16,7 @@
from eva.metrics.processor import MetricsContextProcessor
from eva.metrics.registry import MetricRegistry, get_global_registry
from eva.metrics.utils import direction_for_sub_metric
from eva.metrics.versioning import _CURRENT_METRIC_VERSION
from eva.models.config import PipelineType, get_pipeline_type
from eva.models.record import EvaluationRecord
from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
@@ -448,6 +449,9 @@ async def _run_record(self, record_id: str, record_dir: Path) -> RecordMetrics:
        # Create tasks for all metrics
        async def compute_metric(metric: BaseMetric) -> tuple[str, MetricScore]:
            """Compute a single metric and handle errors."""
            # Each gather() task gets its own contextvar snapshot, so this set is
            # isolated from sibling/parent tasks — no reset needed.
            _CURRENT_METRIC_VERSION.set(metric.version)
            try:
                logger.info(f"[{record_id}] Starting metric: {metric.name}")
                score = await metric.compute(context)
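Why that `set()` needs no matching reset: `asyncio.gather()` wraps each coroutine in a Task, and every Task runs in a copy of the caller's context, so contextvar writes stay local to the task that made them. A self-contained sketch of that behavior (toy names, not eva code):

import asyncio
from contextvars import ContextVar

_VERSION: ContextVar[str | None] = ContextVar("version", default=None)


async def compute(name: str, version: str) -> tuple[str, str | None]:
    _VERSION.set(version)  # visible only inside this task's copy of the context
    await asyncio.sleep(0)  # yield so the two tasks interleave
    return name, _VERSION.get()


async def main() -> None:
    results = await asyncio.gather(compute("a", "v1"), compute("b", "v2"))
    print(results)  # [('a', 'v1'), ('b', 'v2')]: no cross-talk between tasks
    print(_VERSION.get())  # None: the caller's context is untouched


asyncio.run(main())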
78 changes: 78 additions & 0 deletions src/eva/metrics/signatures.py
@@ -0,0 +1,78 @@
"""Compute drift signatures for metric classes.

A metric's "signature" captures everything we want to detect changes to:
- `version`: the manually-bumped string on the class
- `source_hash`: sha256[:12] of `inspect.getsource(cls)` (class body)
- `prompt_hash`: sha256[:12] of the unrendered judge prompt template, or
  None for non-judge metrics

The drift test compares the current signatures against a checked-in fixture
and fails if anything changed without an explicit version bump + fixture regen.
"""

import hashlib
import inspect

# Importing the metric subpackages forces all concrete metric classes to be
# registered as BaseMetric subclasses, so walking __subclasses__ finds them.
import eva.metrics.accuracy # noqa: F401
import eva.metrics.diagnostic # noqa: F401
import eva.metrics.experience # noqa: F401
import eva.metrics.validation # noqa: F401
from eva.metrics.base import AudioJudgeMetric, BaseMetric, TextJudgeMetric
from eva.metrics.versioning import hash_prompt_template
from eva.utils.prompt_manager import get_prompt_manager


def _all_concrete_versioned_metric_classes() -> dict[str, type[BaseMetric]]:
    """Walk BaseMetric subclasses; return concrete classes that set a version.

    Keyed on class qualname so co-named classes (e.g., the cascade vs S2S
    variants of `agent_speech_fidelity`) get distinct entries.
    """
    result: dict[str, type[BaseMetric]] = {}

    def walk(cls: type) -> None:
        for sub in cls.__subclasses__():
            walk(sub)
            if inspect.isabstract(sub):
                continue
            # `version` is None on BaseMetric; only concrete classes that
            # deliberately set it are participating.
            if getattr(sub, "version", None) is None:
                continue
            result[sub.__qualname__] = sub

    walk(BaseMetric)
    return result


def _source_hash(cls: type) -> str:
    """sha256[:12] of the class body source."""
    return hashlib.sha256(inspect.getsource(cls).encode("utf-8")).hexdigest()[:12]


def _prompt_hash_for_metric(cls: type[BaseMetric]) -> str | None:
    """Return the prompt template hash for judge metrics, or None.

    All judge metrics in this codebase use `judge.{name}.user_prompt`.
    A judge metric without a corresponding template raises KeyError —
    that's a configuration bug we want surfaced.
    """
    if not issubclass(cls, TextJudgeMetric | AudioJudgeMetric):
        return None
    template = get_prompt_manager().get_template(f"judge.{cls.name}.user_prompt")
    return hash_prompt_template(template)


def compute_all_metric_signatures() -> dict[str, dict[str, str | None]]:
    """Return {class_qualname: {name, version, source_hash, prompt_hash}} for every concrete metric."""
    out: dict[str, dict[str, str | None]] = {}
    for qualname, cls in _all_concrete_versioned_metric_classes().items():
        out[qualname] = {
            "name": cls.name,
            "version": cls.version,
            "source_hash": _source_hash(cls),
            "prompt_hash": _prompt_hash_for_metric(cls),
        }
    return out
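The drift test itself is not part of this diff. Under the assumptions in the docstrings above (fixture at tests/fixtures/metric_signatures.json, test at tests/unit/metrics/test_metric_signatures.py), it plausibly reduces to a single dict comparison; a sketch:

import json
from pathlib import Path

from eva.metrics.signatures import compute_all_metric_signatures

# Hypothetical path arithmetic: tests/unit/metrics/ -> tests/fixtures/
FIXTURE_PATH = Path(__file__).resolve().parents[2] / "fixtures" / "metric_signatures.json"


def test_metric_signatures_match_fixture() -> None:
    expected = json.loads(FIXTURE_PATH.read_text())
    actual = compute_all_metric_signatures()
    # Any source, prompt, or version drift surfaces as a dict mismatch; the fix is an
    # intentional version bump followed by `python scripts/regen_metric_signatures.py`.
    assert actual == expected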
1 change: 1 addition & 0 deletions src/eva/metrics/validation/conversation_valid_end.py
@@ -14,6 +14,7 @@ class ConversationValidEndMetric(CodeMetric):
"""Binary score: 1.0 when the conversation ended on goodbye OR agent-timeout-on-user-turn; 0.0 otherwise."""

name = "conversation_valid_end"
version = "v0.1"
description = "Validation metric: conversation reached a definitive end state"
category = "validation"

1 change: 1 addition & 0 deletions src/eva/metrics/validation/user_behavioral_fidelity.py
@@ -65,6 +65,7 @@ class UserBehavioralFidelityMetric(ConversationTextJudgeMetric):
"""

name = "user_behavioral_fidelity"
version = "v0.1"
description = "Validation metric for simulated user corruption detection"
category = "validation"
rating_scale = (0, 1)
1 change: 1 addition & 0 deletions src/eva/metrics/validation/user_speech_fidelity.py
@@ -14,6 +14,7 @@ class UserSpeechFidelityMetric(SpeechFidelityBaseMetric):
"""

name = "user_speech_fidelity"
version = "v0.1"
description = "Audio-based validation of user speech fidelity to the intended text"
category = "validation"
role = "user"
24 changes: 24 additions & 0 deletions src/eva/metrics/versioning.py
@@ -0,0 +1,24 @@
"""Per-record version + prompt-hash stamping for MetricScore.

MetricsRunner sets these contextvars around every metric.compute() call.
The MetricScore Pydantic model has a model_validator that reads them and
auto-fills the version/prompt_hash fields when unset, so all scores and
sub-scores built inside that compute() inherit the right values without
each call site having to thread them through explicitly.

Both contextvars default to None, which means "not currently inside a
metric compute() call" — that's the state during JSON deserialization
(loading metrics.json from disk), so existing on-disk values are
preserved instead of being overwritten with None.
"""

import hashlib
from contextvars import ContextVar

_CURRENT_METRIC_VERSION: ContextVar[str | None] = ContextVar("current_metric_version", default=None)
_CURRENT_PROMPT_HASH: ContextVar[str | None] = ContextVar("current_prompt_hash", default=None)


def hash_prompt_template(template: str) -> str:
    """Return sha256[:12] of an unrendered prompt template string."""
    return hashlib.sha256(template.encode()).hexdigest()[:12]
27 changes: 26 additions & 1 deletion src/eva/models/results.py
@@ -4,7 +4,7 @@
from datetime import datetime
from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator


class ErrorDetails(BaseModel):
@@ -94,10 +94,35 @@ class MetricScore(BaseModel):
        False,
        description="True when the metric had no applicable data to score (distinct from errored)",
    )
    version: str | None = Field(
        None,
        description="Metric implementation version (set by the metric class) for tracking which "
        "computation logic produced this score across partial reruns",
    )
    prompt_hash: str | None = Field(
        None,
        description="sha256[:12] of the unrendered judge prompt template; None for non-judge metrics. "
        "Lets us detect prompt edits without relying on the metric author to bump `version`.",
    )
    sub_metrics: dict[str, "MetricScore"] | None = Field(
        None, description="Optional sub-metric breakdowns, aggregated generically by the runner"
    )

    @model_validator(mode="after")
    def _auto_stamp_version_and_hash(self) -> "MetricScore":
        # Only fill if unset, so deserialization from disk preserves historical values
        # and explicit kwargs (e.g., tests) always win.
        if self.version is None or self.prompt_hash is None:
            # Lazy import to avoid circular dependency:
            # eva.models.results -> eva.metrics -> ... -> eva.metrics.utils -> eva.models.results
            from eva.metrics.versioning import _CURRENT_METRIC_VERSION, _CURRENT_PROMPT_HASH

            if self.version is None:
                self.version = _CURRENT_METRIC_VERSION.get()
            if self.prompt_hash is None:
                self.prompt_hash = _CURRENT_PROMPT_HASH.get()
        return self


class PassAtKResult(BaseModel):
"""pass@k and pass^k result for a single metric across multiple trials."""