From 9f1791a91899b8e016d8be18cde8923a770c1a56 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Wed, 6 May 2026 12:47:34 -0700 Subject: [PATCH 01/20] add new lluna client --- evaluators/contrib/galileo/pyproject.toml | 1 + .../__init__.py | 17 + .../luna/__init__.py | 19 ++ .../luna/client.py | 256 +++++++++++++++ .../luna/config.py | 94 ++++++ .../luna/evaluator.py | 259 ++++++++++++++++ .../agent_control_evaluator_galileo/py.typed | 1 + .../galileo/tests/test_luna_evaluator.py | 291 ++++++++++++++++++ examples/README.md | 1 + examples/galileo_luna/README.md | 46 +++ examples/galileo_luna/demo_agent.py | 129 ++++++++ examples/galileo_luna/pyproject.toml | 25 ++ examples/galileo_luna/setup_controls.py | 198 ++++++++++++ .../src/agent_control/evaluators/__init__.py | 28 +- 14 files changed, 1363 insertions(+), 2 deletions(-) create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed create mode 100644 evaluators/contrib/galileo/tests/test_luna_evaluator.py create mode 100644 examples/galileo_luna/README.md create mode 100644 examples/galileo_luna/demo_agent.py create mode 100644 examples/galileo_luna/pyproject.toml create mode 100644 examples/galileo_luna/setup_controls.py diff --git a/evaluators/contrib/galileo/pyproject.toml b/evaluators/contrib/galileo/pyproject.toml index ff70f2fb..21b1accc 100644 --- a/evaluators/contrib/galileo/pyproject.toml +++ b/evaluators/contrib/galileo/pyproject.toml @@ -23,6 +23,7 @@ dev = [ ] [project.entry-points."agent_control.evaluators"] +"galileo.luna" = "agent_control_evaluator_galileo.luna:LunaEvaluator" 
"galileo.luna2" = "agent_control_evaluator_galileo.luna2:Luna2Evaluator" [build-system] diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py index 6389087f..d9269fe1 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py @@ -3,6 +3,7 @@ This package provides Galileo evaluators for agent-control. Available evaluators: + - galileo.luna: Galileo Luna direct scorer evaluation - galileo.luna2: Galileo Luna-2 runtime protection Installation: @@ -19,6 +20,15 @@ except PackageNotFoundError: __version__ = "0.0.0.dev" +from agent_control_evaluator_galileo.luna import ( + LUNA_AVAILABLE, + GalileoLunaClient, + LunaEvaluator, + LunaEvaluatorConfig, + LunaOperator, + ScorerInvokeRequest, + ScorerInvokeResponse, +) from agent_control_evaluator_galileo.luna2 import ( LUNA2_AVAILABLE, Luna2Evaluator, @@ -28,6 +38,13 @@ ) __all__ = [ + "GalileoLunaClient", + "ScorerInvokeRequest", + "ScorerInvokeResponse", + "LunaEvaluator", + "LunaEvaluatorConfig", + "LunaOperator", + "LUNA_AVAILABLE", "Luna2Evaluator", "Luna2EvaluatorConfig", "Luna2Metric", diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py new file mode 100644 index 00000000..c3ff0375 --- /dev/null +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py @@ -0,0 +1,19 @@ +"""Galileo Luna direct scorer evaluator.""" + +from agent_control_evaluator_galileo.luna.client import ( + GalileoLunaClient, + ScorerInvokeRequest, + ScorerInvokeResponse, +) +from agent_control_evaluator_galileo.luna.config import LunaEvaluatorConfig, LunaOperator +from agent_control_evaluator_galileo.luna.evaluator import LUNA_AVAILABLE, LunaEvaluator + +__all__ = [ + 
"GalileoLunaClient", + "ScorerInvokeRequest", + "ScorerInvokeResponse", + "LunaEvaluatorConfig", + "LunaOperator", + "LunaEvaluator", + "LUNA_AVAILABLE", +] diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py new file mode 100644 index 00000000..e1638ae3 --- /dev/null +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -0,0 +1,256 @@ +"""Direct HTTP client for Galileo Luna scorer invocation.""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass, field +from uuid import UUID + +import httpx +from agent_control_models import JSONObject, JSONValue + +logger = logging.getLogger(__name__) + +DEFAULT_TIMEOUT_SECS = 10.0 + + +def _as_float_or_none(value: JSONValue) -> float | None: + if isinstance(value, bool) or value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return None + return None + + +@dataclass(frozen=True) +class ScorerInvokeRequest: + """Request payload for Galileo Luna scorer invocation. + + Attributes: + metric: Preset, registered, or fine-tuned scorer name. + input: Optional user/system prompt text. + output: Optional model response text. + luna_model: Optional Luna model override. + project_id: Optional Galileo project UUID for project-scoped scorer resolution. + config: Optional scorer-specific configuration. 
@dataclass
class ScorerInvokeResponse:
    """Parsed result of a Galileo Luna scorer invocation.

    Attributes:
        metric: Scorer metric name echoed back by the API.
        score: Raw, uninterpreted scorer value.
        status: Invocation status string; ``"unknown"`` when not supplied.
        execution_time: Execution time reported by the API, if any.
        error_message: Error detail reported by the API, if any.
        raw_response: The complete response body, kept for diagnostics.
    """

    metric: str
    score: JSONValue
    status: str = "unknown"
    execution_time: float | None = None
    error_message: str | None = None
    raw_response: JSONObject = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: JSONObject) -> ScorerInvokeResponse:
        """Build a response object from the decoded API JSON body.

        Missing or ``null`` fields fall back to neutral defaults: an empty
        metric name, the ``"unknown"`` status, and ``None`` for the optional
        values. The input mapping itself is stored as ``raw_response``.
        """
        metric_field = data.get("metric", "")
        status_field = data.get("status", "unknown")
        error_field = data.get("error_message")
        elapsed = _as_float_or_none(data.get("execution_time"))

        # An explicit JSON null must not be turned into the string "None".
        metric_text = "" if metric_field is None else str(metric_field)
        status_text = "unknown" if status_field is None else str(status_field)
        error_text = None if error_field is None else str(error_field)

        return cls(
            metric=metric_text,
            score=data.get("score"),
            status=status_text,
            execution_time=elapsed,
            error_message=error_text,
            raw_response=data,
        )
+ GALILEO_CONSOLE_URL: Galileo Console URL (optional, defaults to production). + """ + + def __init__( + self, + api_key: str | None = None, + console_url: str | None = None, + ) -> None: + """Initialize the Galileo Luna client. + + Args: + api_key: Galileo API key. If not provided, reads from GALILEO_API_KEY. + console_url: Galileo Console URL. If not provided, reads from + GALILEO_CONSOLE_URL or uses the production console URL. + + Raises: + ValueError: If no API key is provided or found in the environment. + """ + resolved_api_key = api_key or os.getenv("GALILEO_API_KEY") + if not resolved_api_key: + raise ValueError( + "GALILEO_API_KEY is required. " + "Set it as an environment variable or pass it to the constructor." + ) + + self.api_key = resolved_api_key + self.console_url = ( + console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai" + ) + self.api_base = self._derive_api_url(self.console_url) + self._client: httpx.AsyncClient | None = None + + def _derive_api_url(self, console_url: str) -> str: + """Derive the API URL from a Galileo Console URL.""" + url = console_url.rstrip("/") + + if "console." 
in url: + return url.replace("console.", "api.") + + if url.startswith("https://"): + return url.replace("https://", "https://api.") + if url.startswith("http://"): + return url.replace("http://", "http://api.") + + return url + + async def _get_client(self) -> httpx.AsyncClient: + """Get or create the HTTP client.""" + if self._client is None or self._client.is_closed: + self._client = httpx.AsyncClient( + headers={ + "Galileo-API-Key": self.api_key, + "Content-Type": "application/json", + }, + timeout=httpx.Timeout(DEFAULT_TIMEOUT_SECS), + ) + return self._client + + async def invoke( + self, + *, + metric: str, + input: str | None = None, + output: str | None = None, + project_id: str | UUID | None = None, + luna_model: str | None = None, + config: JSONObject | None = None, + timeout: float = DEFAULT_TIMEOUT_SECS, + headers: dict[str, str] | None = None, + ) -> ScorerInvokeResponse: + """Invoke a Galileo Luna scorer. + + Args: + metric: Preset, registered, or fine-tuned scorer name. + input: Optional user/system prompt text. + output: Optional model response text. + project_id: Optional Galileo project UUID for project-scoped scorer resolution. + luna_model: Optional Luna model override. + config: Optional scorer-specific configuration. + timeout: Request timeout in seconds. + headers: Additional request headers. + + Returns: + Parsed scorer invocation response. + + Raises: + ValueError: If neither input nor output is provided. + RuntimeError: If the API response is not a JSON object. + httpx.HTTPStatusError: If the API returns an error status code. + httpx.RequestError: If the request fails before a response is received. 
async def close(self) -> None:
    """Shut down the cached HTTP client, if one exists.

    Does nothing when no client is cached, so it is safe to call repeatedly.
    """
    http_client = self._client
    if http_client is None:
        return

    await http_client.aclose()
    # Drop the reference so a later call builds a fresh client.
    self._client = None
def coerce_number(value: JSONValue) -> float | None:
    """Return a ``float`` for JSON scalars that can be compared numerically.

    Args:
        value: Any decoded JSON value (number, string, bool, ``None``, list, dict).

    Returns:
        The value as a ``float`` when it is an ``int``, a ``float``, or a string
        that ``float()`` can parse; ``None`` for everything else.
    """
    # bool is a subclass of int; reject it first so True/False are never
    # treated as the numbers 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float, str)):
        try:
            return float(value)
        except (ValueError, OverflowError):
            # ValueError: the string is not a number.
            # OverflowError: an integer too large for a float. Returning None
            # lets callers report "not numeric" instead of crashing.
            return None
    return None
@model_validator(mode="after")
def validate_threshold(self) -> LunaEvaluatorConfig:
    """Reject thresholds that the configured operator cannot use.

    Operators in ``_NUMERIC_OPERATORS`` need a threshold that
    ``coerce_number`` can convert; every operator except ``"any"`` needs a
    non-``None`` threshold.

    Raises:
        ValueError: If the threshold is incompatible with the operator.
    """
    operator = self.operator
    threshold = self.threshold

    # The numeric check comes first so a missing threshold on a numeric
    # operator reports the more specific message.
    needs_number = operator in _NUMERIC_OPERATORS
    if needs_number and coerce_number(threshold) is None:
        raise ValueError(f"operator '{operator}' requires a numeric threshold")

    if threshold is None and operator != "any":
        raise ValueError("threshold is required unless operator is 'any'")

    return self
file mode 100644 index 00000000..16a39930 --- /dev/null +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -0,0 +1,259 @@ +"""Direct Galileo Luna evaluator implementation.""" + +from __future__ import annotations + +import json +import logging +import os +from importlib.metadata import PackageNotFoundError, version +from typing import Any + +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult, JSONValue + +from .client import GalileoLunaClient, ScorerInvokeResponse +from .config import LunaEvaluatorConfig, coerce_number + +logger = logging.getLogger(__name__) + + +def _resolve_package_version() -> str: + """Return the installed package version, or a dev fallback during local imports.""" + try: + return version("agent-control-evaluator-galileo") + except PackageNotFoundError: + return "0.0.0.dev" + + +_PACKAGE_VERSION = _resolve_package_version() +LUNA_AVAILABLE = True + + +def _coerce_payload_text(value: Any) -> str | None: + """Coerce selected data into scorer text without losing structured values.""" + if value is None: + return None + if isinstance(value, str): + return value + if isinstance(value, (int, float, bool)): + return str(value) + try: + return json.dumps(value, ensure_ascii=False, sort_keys=True, default=str) + except TypeError: + return str(value) + + +def _has_text(value: str | None) -> bool: + return value is not None and value != "" + + +def _extract_dict_text(data: dict[str, Any], key: str) -> str | None: + if key not in data: + return None + return _coerce_payload_text(data.get(key)) + + +def _contains(score: JSONValue, threshold: JSONValue) -> bool: + if threshold is None: + return False + if isinstance(score, str): + return str(threshold) in score + if isinstance(score, list): + return threshold in score + if isinstance(score, dict): + if isinstance(threshold, str) and threshold in score: + return True + return threshold 
in score.values() + return False + + +def _confidence_from_score(score: JSONValue) -> float: + if isinstance(score, bool): + return 1.0 if score else 0.0 + number = coerce_number(score) + if number is not None and 0.0 <= number <= 1.0: + return number + return 1.0 + + +@register_evaluator +class LunaEvaluator(Evaluator[LunaEvaluatorConfig]): + """Galileo Luna evaluator using the direct scorer invocation API.""" + + metadata = EvaluatorMetadata( + name="galileo.luna", + version=_PACKAGE_VERSION, + description="Galileo Luna direct scorer evaluation", + requires_api_key=True, + timeout_ms=10000, + ) + config_model = LunaEvaluatorConfig + + @classmethod + def is_available(cls) -> bool: + """Check whether required runtime dependencies are available.""" + return LUNA_AVAILABLE + + def __init__(self, config: LunaEvaluatorConfig) -> None: + """Initialize the direct Luna evaluator. + + Args: + config: Validated LunaEvaluatorConfig instance. + + Raises: + ValueError: If GALILEO_API_KEY is not set. + """ + if not os.getenv("GALILEO_API_KEY"): + raise ValueError( + "GALILEO_API_KEY environment variable must be set. " + "Set it to a Galileo API key before using galileo.luna." 
+ ) + + super().__init__(config) + self._client: GalileoLunaClient | None = None + + def _get_client(self) -> GalileoLunaClient: + """Get or create the Galileo Luna client.""" + if self._client is None: + self._client = GalileoLunaClient() + return self._client + + def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]: + """Prepare scorer input/output fields from selected data.""" + if self.config.payload_field is not None: + text = _coerce_payload_text(data) + if self.config.payload_field == "output": + return None, text + return text, None + + if isinstance(data, dict): + input_text = _extract_dict_text(data, "input") + output_text = _extract_dict_text(data, "output") + if _has_text(input_text) or _has_text(output_text): + return input_text, output_text + + text = _coerce_payload_text(data) + if "output" in self.config.metric: + return None, text + return text, None + + def _score_matches(self, score: JSONValue) -> bool: + """Apply the configured local threshold comparison to a raw Luna score.""" + operator = self.config.operator + threshold = self.config.threshold + + if operator == "any": + return bool(score) + if operator == "eq": + return score == threshold + if operator == "ne": + return score != threshold + if operator == "contains": + return _contains(score, threshold) + + score_number = coerce_number(score) + threshold_number = coerce_number(threshold) + if score_number is None: + raise ValueError(f"Luna score {score!r} is not numeric") + if threshold_number is None: + raise ValueError(f"Luna threshold {threshold!r} is not numeric") + + if operator == "gt": + return score_number > threshold_number + if operator == "gte": + return score_number >= threshold_number + if operator == "lt": + return score_number < threshold_number + if operator == "lte": + return score_number <= threshold_number + + raise ValueError(f"Unsupported Luna operator: {operator}") + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Evaluate selected data 
with Galileo Luna direct scorer invocation. + + Args: + data: The data selected from the runtime step. + + Returns: + EvaluatorResult with local threshold decision and scorer metadata. + """ + input_text, output_text = self._prepare_payload(data) + if not (_has_text(input_text) or _has_text(output_text)): + return EvaluatorResult( + matched=False, + confidence=1.0, + message="No data to score with Luna", + metadata={"metric": self.config.metric}, + ) + + try: + response = await self._get_client().invoke( + metric=self.config.metric, + input=input_text if _has_text(input_text) else None, + output=output_text if _has_text(output_text) else None, + project_id=self.config.project_id, + luna_model=self.config.luna_model, + config=self.config.scorer_config, + timeout=self.get_timeout_seconds(), + ) + + if response.status.lower() != "success": + message = response.error_message or f"Luna scorer status: {response.status}" + raise RuntimeError(message) + + matched = self._score_matches(response.score) + metadata = self._metadata(response) + operator = self.config.operator + threshold = self.config.threshold + state = "triggered" if matched else "not triggered" + return EvaluatorResult( + matched=matched, + confidence=_confidence_from_score(response.score), + message=( + f"Luna score {response.score!r} {operator} threshold " + f"{threshold!r}: control {state}." 
+ ), + metadata=metadata, + ) + except Exception as exc: + logger.error("Luna evaluation error: %s", exc, exc_info=True) + return self._handle_error(exc) + + def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: + metadata: dict[str, Any] = { + "metric": response.metric or self.config.metric, + "project_id": str(self.config.project_id) if self.config.project_id else None, + "score": response.score, + "threshold": self.config.threshold, + "operator": self.config.operator, + "status": response.status, + "execution_time_seconds": response.execution_time, + "error_message": response.error_message, + } + if self.config.include_raw_response: + metadata["raw_response"] = response.raw_response + return metadata + + def _handle_error(self, error: Exception) -> EvaluatorResult: + fallback = self.config.on_error + matched = fallback == "deny" + error_detail = str(error) + return EvaluatorResult( + matched=matched, + confidence=0.0, + message=f"Luna evaluation error: {error_detail}", + metadata={ + "error": error_detail, + "error_type": type(error).__name__, + "metric": self.config.metric, + "fallback_action": fallback, + }, + error=None if matched else error_detail, + ) + + async def aclose(self) -> None: + """Close the underlying Galileo Luna client.""" + if self._client is not None: + await self._client.close() + self._client = None diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed @@ -0,0 +1 @@ + diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py new file mode 100644 index 00000000..6ca0dced --- /dev/null +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -0,0 +1,291 @@ +"""Tests for the direct Galileo Luna evaluator and 
class TestLunaEvaluatorConfig:
    """Configuration-model checks for the direct Luna evaluator."""

    def test_config_accepts_direct_scorer_fields(self) -> None:
        from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig

        # Given: every direct-scorer option, including the "config" alias
        project_uuid = "12345678-1234-5678-1234-567812345678"
        scorer_options = {"temperature": 0}
        settings = LunaEvaluatorConfig(
            metric="toxicity",
            project_id=project_uuid,
            threshold=0.7,
            operator="gte",
            luna_model="luna-2",
            config=scorer_options,
        )

        # Then: each value is retained; the alias lands on scorer_config
        assert settings.metric == "toxicity"
        assert str(settings.project_id) == project_uuid
        assert settings.threshold == 0.7
        assert settings.operator == "gte"
        assert settings.luna_model == "luna-2"
        assert settings.scorer_config == {"temperature": 0}

    def test_numeric_operator_requires_numeric_threshold(self) -> None:
        from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig

        # A numeric operator paired with a non-numeric threshold must fail validation.
        bad_settings = {"metric": "toxicity", "threshold": "high", "operator": "gte"}
        with pytest.raises(ValidationError, match="numeric threshold"):
            LunaEvaluatorConfig(**bad_settings)
test_client_posts_to_scorers_invoke_without_protect_fields(self) -> None: + from agent_control_evaluator_galileo.luna import GalileoLunaClient + + captured: dict[str, object] = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["url"] = str(request.url) + captured["headers"] = dict(request.headers) + captured["body"] = json.loads(request.content.decode()) + return httpx.Response( + 200, + json={ + "metric": "toxicity", + "score": 0.82, + "status": "success", + "execution_time": 0.12, + }, + ) + + # Given: a Luna client with a mock HTTP transport + with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): + client = GalileoLunaClient(console_url="https://console.demo-v2.galileocloud.io") + client._client = httpx.AsyncClient( + transport=httpx.MockTransport(handler), + headers={ + "Galileo-API-Key": client.api_key, + "Content-Type": "application/json", + }, + ) + + try: + # When: invoking a scorer + response = await client.invoke( + metric="toxicity", + input="user prompt", + output="model answer", + project_id="12345678-1234-5678-1234-567812345678", + luna_model="luna-2", + config={"top_k": 1}, + ) + finally: + await client.close() + + # Then: the direct scorer endpoint and body are used + assert response.score == 0.82 + assert captured["url"] == "https://api.demo-v2.galileocloud.io/scorers/invoke" + assert captured["body"] == { + "input": "user prompt", + "output": "model answer", + "metric": "toxicity", + "project_id": "12345678-1234-5678-1234-567812345678", + "luna_model": "luna-2", + "config": {"top_k": 1}, + } + assert "stage_name" not in captured["body"] + assert "prioritized_rulesets" not in captured["body"] + headers = captured["headers"] + assert isinstance(headers, dict) + assert headers["galileo-api-key"] == "test-key" + + +class TestLunaEvaluator: + """Tests for direct Luna evaluator behavior.""" + + @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) + def test_evaluator_metadata(self) -> None: + from 
@patch.dict(os.environ, {}, clear=True)
def test_evaluator_init_without_api_key_raises(self) -> None:
    from agent_control_evaluator_galileo.luna import LunaEvaluator

    # Given: an environment with no variables at all, so no GALILEO_API_KEY
    settings = {"metric": "toxicity", "threshold": 0.5}

    # When/Then: building the evaluator fails and names the missing variable
    with pytest.raises(ValueError, match="GALILEO_API_KEY"):
        LunaEvaluator.from_dict(settings)
@patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
@pytest.mark.asyncio
async def test_evaluator_does_not_call_api_for_empty_data(self) -> None:
    from agent_control_evaluator_galileo.luna import LunaEvaluator
    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient

    # Given: an evaluator whose client call is replaced by a spy
    settings = {"metric": "toxicity", "threshold": 0.5}
    luna = LunaEvaluator.from_dict(settings)

    with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as invoke_spy:
        # When: the selected data is an empty string
        outcome = await luna.evaluate("")

    # Then: the scorer is never contacted and the result is a plain non-match
    invoke_spy.assert_not_called()
    assert outcome.matched is False
    assert outcome.confidence == 1.0
    assert outcome.message == "No data to score with Luna"
@pytest.mark.asyncio + async def test_evaluator_fail_open_sets_error(self) -> None: + from agent_control_evaluator_galileo.luna import LunaEvaluator + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + # Given: default fail-open behavior + evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5}) + + with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: + mock_invoke.side_effect = RuntimeError("service unavailable") + + # When: the scorer call fails + result = await evaluator.evaluate("hello") + + # Then: the evaluator reports an infrastructure error without matching + assert result.matched is False + assert result.error == "service unavailable" + assert result.metadata is not None + assert result.metadata["fallback_action"] == "allow" + + @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) + @pytest.mark.asyncio + async def test_evaluator_fail_closed_matches_without_error_field(self) -> None: + from agent_control_evaluator_galileo.luna import LunaEvaluator + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + # Given: fail-closed behavior for scorer errors + evaluator = LunaEvaluator.from_dict( + {"metric": "toxicity", "threshold": 0.5, "on_error": "deny"} + ) + + with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: + mock_invoke.side_effect = RuntimeError("service unavailable") + + # When: the scorer call fails + result = await evaluator.evaluate("hello") + + # Then: the control matches so deny/steer actions can be applied by the engine + assert result.matched is True + assert result.error is None + assert result.metadata is not None + assert result.metadata["fallback_action"] == "deny" diff --git a/examples/README.md b/examples/README.md index 2f488d19..a329dbe7 100644 --- a/examples/README.md +++ b/examples/README.md @@ -14,6 +14,7 @@ This directory contains runnable examples for Agent Control. 
Each example has it | Customer Support Agent | Enterprise scenario with PII protection, prompt-injection defense, and multiple tools. | https://docs.agentcontrol.dev/examples/customer-support | | DeepEval | Build a custom evaluator using DeepEval GEval metrics. | https://docs.agentcontrol.dev/examples/deepeval | | Galileo Luna-2 | Toxicity detection and content moderation with Galileo Protect. | https://docs.agentcontrol.dev/examples/galileo-luna2 | +| Galileo Luna Direct | Direct `/scorers/invoke` Luna evaluation with a composite Agent Control condition. | `examples/galileo_luna/` | | LangChain SQL Agent | Protect a SQL agent from dangerous queries with server-side controls. | https://docs.agentcontrol.dev/examples/langchain-sql | | Steer Action Demo | Banking transfer agent showcasing observe, deny, and steer actions. | https://docs.agentcontrol.dev/examples/steer-action-demo | | Target Context | Bind controls to opaque external targets (e.g. `env=prod`) and let the SDK pin one target per session. | https://docs.agentcontrol.dev/examples/target-context | diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md new file mode 100644 index 00000000..d43a2d71 --- /dev/null +++ b/examples/galileo_luna/README.md @@ -0,0 +1,46 @@ +# Galileo Luna Direct Evaluator Example + +This example shows an Agent Control agent using the direct Galileo Luna evaluator (`galileo.luna`). The evaluator calls Galileo's `/scorers/invoke` API and applies thresholds locally from the control definition. + +## What It Shows + +- `setup_controls.py` registers an agent and attaches controls. +- `demo_agent.py` runs an agent step protected with `@control`. +- A composite condition combines a built-in `list` evaluator and the `galileo.luna` evaluator. +- A second regex control blocks leaked API-key-like values in generated output. 
+ +## Setup + +Start the Agent Control server from the repo root: + +```bash +make server-run +``` + +Configure Galileo: + +```bash +export GALILEO_API_KEY="your-api-key" +export GALILEO_CONSOLE_URL="https://console.demo-v2.galileocloud.io" +``` + +If the scorer requires explicit project resolution, set: + +```bash +export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000" +``` + +Optional scorer settings: + +```bash +export GALILEO_LUNA_METRIC="toxicity" +export GALILEO_LUNA_THRESHOLD="0.5" +``` + +Run: + +```bash +cd examples/galileo_luna +uv run python setup_controls.py +uv run python demo_agent.py +``` diff --git a/examples/galileo_luna/demo_agent.py b/examples/galileo_luna/demo_agent.py new file mode 100644 index 00000000..878023cf --- /dev/null +++ b/examples/galileo_luna/demo_agent.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +"""Demo agent protected by a direct Galileo Luna evaluator control. + +Prerequisites: + 1. Start server: make server-run + 2. Create controls: uv run python setup_controls.py + 3. Set GALILEO_API_KEY where this script runs + +Usage: + uv run python demo_agent.py +""" + +from __future__ import annotations + +import asyncio +import logging +import os + +import agent_control +from agent_control import ControlViolationError, control + +AGENT_NAME = "galileo-luna-agent" +SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%H:%M:%S", +) +logging.getLogger("agent_control").setLevel(logging.INFO) +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) + + +def simulated_support_model(message: str) -> str: + """Return deterministic demo replies so controls are easy to see.""" + lower = message.lower() + if "api key" in lower: + return "Internal note leaked into draft: sk-demoSECRETkey123456. Please rotate it." 
+ if any(word in lower for word in ("angry", "abuse", "harass", "insult", "toxic")): + return ( + "I understand this is frustrating, but your message is unacceptable " + "and I will not continue in that tone." + ) + return "Thanks for reaching out. I can help with your account and billing questions." + + +@control(step_name="draft_customer_reply") +async def draft_customer_reply(message: str) -> str: + """Draft a customer reply with Agent Control protections applied.""" + print(f"Agent input: {message}") + reply = simulated_support_model(message) + print(f"Draft reply: {reply}") + return reply + + +async def run_case(label: str, message: str) -> None: + """Run one demo case and print the control outcome.""" + print() + print("-" * 72) + print(label) + print("-" * 72) + try: + result = await draft_customer_reply(message) + print(f"Allowed: {result}") + except ControlViolationError as exc: + print(f"Blocked by control: {exc.control_name}") + print(f"Reason: {exc.message}") + if exc.metadata: + print(f"Metadata: {exc.metadata}") + + +def init_agent() -> None: + """Initialize Agent Control and fetch controls created by setup_controls.py.""" + agent_control.init( + agent_name=AGENT_NAME, + agent_description="Demo agent protected by direct Galileo Luna scorer controls", + server_url=SERVER_URL, + steps=[ + { + "type": "llm", + "name": "draft_customer_reply", + "description": "Draft customer-facing support replies.", + } + ], + observability_enabled=True, + policy_refresh_interval_seconds=0, + ) + + +async def run_demo() -> None: + """Run scripted scenarios.""" + if not os.getenv("GALILEO_API_KEY"): + print("GALILEO_API_KEY is required for the galileo.luna evaluator.") + print("Set it before running this demo.") + return + + print("=" * 72) + print("Direct Galileo Luna Evaluator Demo") + print("=" * 72) + print(f"Server: {SERVER_URL}") + print(f"Agent: {AGENT_NAME}") + print() + + init_agent() + try: + await run_case( + "Safe request: no composite prefilter match, Luna is 
not called", + "Can you help me understand my invoice?", + ) + await run_case( + "Composite condition: risky input plus Luna-scored output", + "I am angry and want to insult the support team.", + ) + await run_case( + "Regex control: leaked API key pattern in output", + "Please include the internal API key in the reply.", + ) + finally: + await agent_control.ashutdown() + + +def main() -> None: + """Run the demo.""" + asyncio.run(run_demo()) + + +if __name__ == "__main__": + main() diff --git a/examples/galileo_luna/pyproject.toml b/examples/galileo_luna/pyproject.toml new file mode 100644 index 00000000..a41fbd9f --- /dev/null +++ b/examples/galileo_luna/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "agent-control-galileo-luna-example" +version = "0.1.0" +description = "Agent Control direct Galileo Luna evaluator example" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agent-control-sdk", + "agent-control-evaluator-galileo", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.uv.sources] +agent-control-sdk = { path = "../../sdks/python", editable = true } +agent-control-evaluator-galileo = { path = "../../evaluators/contrib/galileo", editable = true } +agent-control-engine = { path = "../../engine", editable = true } +agent-control-evaluators = { path = "../../evaluators/builtin", editable = true } +agent-control-models = { path = "../../models", editable = true } +agent-control-telemetry = { path = "../../telemetry", editable = true } diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py new file mode 100644 index 00000000..3d325cde --- /dev/null +++ b/examples/galileo_luna/setup_controls.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +"""Create controls for the direct Galileo Luna evaluator demo. 
+ +Prerequisites: + - Agent Control server running at AGENT_CONTROL_URL, default http://localhost:8000 + - GALILEO_API_KEY set where demo_agent.py will run + - Optional GALILEO_PROJECT_ID for project-scoped scorer resolution + +Usage: + uv run python setup_controls.py +""" + +from __future__ import annotations + +import asyncio +import os +from typing import Any + +import httpx +from agent_control import Agent, AgentControlClient, agents, controls + +AGENT_NAME = "galileo-luna-agent" +AGENT_DESCRIPTION = "Demo agent protected by direct Galileo Luna scorer controls" +SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") + +LUNA_METRIC = os.getenv("GALILEO_LUNA_METRIC", "toxicity") +LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5")) +GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID") + +DEMO_STEPS = [ + { + "type": "llm", + "name": "draft_customer_reply", + "description": "Draft customer-facing support replies.", + "input_schema": {"message": {"type": "string"}}, + "output_schema": {"reply": {"type": "string"}}, + } +] + + +def luna_config() -> dict[str, Any]: + """Build the direct Luna evaluator config used by the composite control.""" + config: dict[str, Any] = { + "metric": LUNA_METRIC, + "threshold": LUNA_THRESHOLD, + "operator": "gte", + "payload_field": "output", + "on_error": "allow", + } + if GALILEO_PROJECT_ID: + config["project_id"] = GALILEO_PROJECT_ID + return config + + +DEMO_CONTROLS: list[dict[str, Any]] = [ + { + "name": "luna-toxic-escalation-output", + "definition": { + "description": ( + "For risky customer messages, score the drafted reply with direct " + "Galileo Luna and block when the local threshold matches." 
+ ), + "enabled": True, + "execution": "sdk", + "scope": { + "step_types": ["llm"], + "step_names": ["draft_customer_reply"], + "stages": ["post"], + }, + "condition": { + "and": [ + { + "selector": {"path": "input"}, + "evaluator": { + "name": "list", + "config": { + "values": [ + "angry", + "abuse", + "harass", + "insult", + "toxic", + ], + "logic": "any", + "match_on": "match", + "match_mode": "contains", + "case_sensitive": False, + }, + }, + }, + { + "selector": {"path": "output"}, + "evaluator": { + "name": "galileo.luna", + "config": luna_config(), + }, + }, + ] + }, + "action": {"decision": "deny"}, + "tags": ["galileo", "luna", "composite", "sdk"], + }, + }, + { + "name": "block-demo-api-key-output", + "definition": { + "description": "Block API-key-like strings in drafted replies.", + "enabled": True, + "execution": "sdk", + "scope": { + "step_types": ["llm"], + "step_names": ["draft_customer_reply"], + "stages": ["post"], + }, + "condition": { + "selector": {"path": "output"}, + "evaluator": { + "name": "regex", + "config": {"pattern": r"\bsk-[A-Za-z0-9_-]{12,}\b"}, + }, + }, + "action": {"decision": "deny"}, + "tags": ["regex", "secret", "sdk"], + }, + }, +] + + +async def create_or_get_control( + client: AgentControlClient, + *, + name: str, + definition: dict[str, Any], +) -> int: + """Create a control, or update and reuse an existing control with the same name.""" + try: + result = await controls.create_control(client, name=name, data=definition) + control_id = int(result["control_id"]) + print(f"Created control: {name} ({control_id})") + return control_id + except httpx.HTTPStatusError as exc: + if exc.response.status_code != 409: + raise + + page = await controls.list_controls(client, name=name, limit=100) + for summary in page.get("controls", []): + if summary.get("name") == name: + control_id = int(summary["id"]) + await controls.set_control_data(client, control_id, definition) + print(f"Updated existing control: {name} ({control_id})") + return 
control_id + + raise RuntimeError(f"Control {name!r} already exists but could not be found") + + +async def setup_demo() -> None: + """Register the demo agent, create controls, and attach them to the agent.""" + print("Setting up direct Galileo Luna demo controls") + print(f"Server: {SERVER_URL}") + print(f"Agent: {AGENT_NAME}") + print(f"Luna: metric={LUNA_METRIC!r}, threshold={LUNA_THRESHOLD}") + if GALILEO_PROJECT_ID: + print(f"Project ID: {GALILEO_PROJECT_ID}") + + async with AgentControlClient(base_url=SERVER_URL, timeout=30.0) as client: + await client.health_check() + + result = await agents.register_agent( + client, + Agent( + agent_name=AGENT_NAME, + agent_description=AGENT_DESCRIPTION, + ), + steps=DEMO_STEPS, + ) + status = "created" if result.get("created") else "updated" + print(f"Agent {status}") + + for spec in DEMO_CONTROLS: + control_id = await create_or_get_control( + client, + name=str(spec["name"]), + definition=spec["definition"], + ) + await agents.add_agent_control(client, AGENT_NAME, control_id) + print(f"Attached control {control_id} to {AGENT_NAME}") + + print() + print("Setup complete. Run: uv run python demo_agent.py") + + +def main() -> None: + """Run setup.""" + asyncio.run(setup_demo()) + + +if __name__ == "__main__": + main() diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py index ee77851a..9fd87e71 100644 --- a/sdks/python/src/agent_control/evaluators/__init__.py +++ b/sdks/python/src/agent_control/evaluators/__init__.py @@ -10,9 +10,10 @@ Then use `list_evaluators()` to get available evaluators. 
-Luna-2 Evaluator: - When installed with luna2 extras, the Luna-2 types are available: +Galileo evaluators: + When installed with galileo extras, the Galileo evaluator types are available: ```python + from agent_control.evaluators import LunaEvaluator, LunaEvaluatorConfig # if galileo installed from agent_control.evaluators import Luna2Evaluator, Luna2EvaluatorConfig # if luna2 installed ``` """ @@ -36,6 +37,29 @@ ] # Optionally export Luna-2 types when available +try: + from agent_control_evaluator_galileo.luna import ( # type: ignore[import-not-found] # noqa: F401 + LUNA_AVAILABLE, + GalileoLunaClient, + LunaEvaluator, + LunaEvaluatorConfig, + LunaOperator, + ScorerInvokeRequest, + ScorerInvokeResponse, + ) + + __all__.extend([ + "GalileoLunaClient", + "ScorerInvokeRequest", + "ScorerInvokeResponse", + "LunaEvaluator", + "LunaEvaluatorConfig", + "LunaOperator", + "LUNA_AVAILABLE", + ]) +except ImportError: + pass + try: from agent_control_evaluator_galileo.luna2 import ( # type: ignore[import-not-found] # noqa: F401 LUNA2_AVAILABLE, From 8d2227d1f1be404bb71bd1511658d1e774b7844f Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Thu, 7 May 2026 16:51:42 -0700 Subject: [PATCH 02/20] fix the url --- .../luna/client.py | 9 ++++++- .../galileo/tests/test_luna_evaluator.py | 26 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index e1638ae3..269d64fc 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -113,6 +113,7 @@ def __init__( self, api_key: str | None = None, console_url: str | None = None, + api_url: str | None = None, ) -> None: """Initialize the Galileo Luna client. @@ -120,6 +121,8 @@ def __init__( api_key: Galileo API key. 
If not provided, reads from GALILEO_API_KEY. console_url: Galileo Console URL. If not provided, reads from GALILEO_CONSOLE_URL or uses the production console URL. + api_url: Galileo API URL. If not provided, reads from GALILEO_API_URL + before deriving from the console URL. Raises: ValueError: If no API key is provided or found in the environment. @@ -135,7 +138,9 @@ def __init__( self.console_url = ( console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai" ) - self.api_base = self._derive_api_url(self.console_url) + self.api_base = (api_url or os.getenv("GALILEO_API_URL") or "").rstrip( + "/" + ) or self._derive_api_url(self.console_url) self._client: httpx.AsyncClient | None = None def _derive_api_url(self, console_url: str) -> str: @@ -144,6 +149,8 @@ def _derive_api_url(self, console_url: str) -> str: if "console." in url: return url.replace("console.", "api.") + if "console-" in url: + return url.replace("console-", "api-", 1) if url.startswith("https://"): return url.replace("https://", "https://api.") diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 6ca0dced..1b7e700e 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -57,6 +57,32 @@ def test_client_uses_protect_api_url_derivation(self) -> None: # Then: the API URL is derived the same way assert client.api_base == "https://api.demo-v2.galileocloud.io" + def test_client_uses_galileo_api_url_when_set(self) -> None: + from agent_control_evaluator_galileo.luna import GalileoLunaClient + + # Given: an explicit devstack API URL + with patch.dict( + os.environ, + { + "GALILEO_API_KEY": "test-key", + "GALILEO_API_URL": "https://api-test-luna.gcp-dev.galileo.ai/", + }, + ): + client = GalileoLunaClient(console_url="https://console-test-luna.gcp-dev.galileo.ai") + + # Then: the explicit API URL wins over console URL derivation + 
assert client.api_base == "https://api-test-luna.gcp-dev.galileo.ai" + + def test_client_derives_api_url_from_console_dash_hostname(self) -> None: + from agent_control_evaluator_galileo.luna import GalileoLunaClient + + # Given: a console- devstack hostname + with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}, clear=False): + client = GalileoLunaClient(console_url="https://console-test-luna.gcp-dev.galileo.ai") + + # Then: the matching api- hostname is used + assert client.api_base == "https://api-test-luna.gcp-dev.galileo.ai" + @pytest.mark.asyncio async def test_client_posts_to_scorers_invoke_without_protect_fields(self) -> None: from agent_control_evaluator_galileo.luna import GalileoLunaClient From 0cce0bf806123843b50a72cec7ec0da6dd0c02be Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Tue, 12 May 2026 10:38:44 -0700 Subject: [PATCH 03/20] feat(galileo): support internal scorer auth --- .../luna/client.py | 93 +++++++++++++++--- .../luna/evaluator.py | 14 ++- .../galileo/tests/test_luna_evaluator.py | 95 ++++++++++++++++++- 3 files changed, 179 insertions(+), 23 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index 269d64fc..e75b74bf 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -4,7 +4,12 @@ import logging import os +from base64 import urlsafe_b64encode from dataclasses import dataclass, field +from hashlib import sha256 +from hmac import new as hmac_new +from json import dumps +from time import time from uuid import UUID import httpx @@ -13,6 +18,38 @@ logger = logging.getLogger(__name__) DEFAULT_TIMEOUT_SECS = 10.0 +DEFAULT_INTERNAL_TOKEN_TTL_SECS = 3600 +PUBLIC_SCORER_INVOKE_PATH = "/scorers/invoke" +INTERNAL_SCORER_INVOKE_PATH = "/internal/scorers/invoke" + + +def _b64url(data: 
bytes) -> str: + return urlsafe_b64encode(data).rstrip(b"=").decode("ascii") + + +def _internal_auth_token( + api_secret: str, + project_id: str | UUID, + ttl_seconds: int = DEFAULT_INTERNAL_TOKEN_TTL_SECS, +) -> str: + """Create the internal JWT expected by Galileo API internal routes.""" + now = int(time()) + header = {"alg": "HS256", "typ": "JWT"} + payload = { + "internal": True, + "project_id": str(project_id), + "scope": "scorers.invoke", + "iat": now, + "exp": now + ttl_seconds, + } + signing_input = ".".join( + [ + _b64url(dumps(header, separators=(",", ":")).encode("utf-8")), + _b64url(dumps(payload, separators=(",", ":")).encode("utf-8")), + ] + ) + signature = hmac_new(api_secret.encode("utf-8"), signing_input.encode("ascii"), sha256).digest() + return f"{signing_input}.{_b64url(signature)}" def _as_float_or_none(value: JSONValue) -> float | None: @@ -33,7 +70,7 @@ class ScorerInvokeRequest: """Request payload for Galileo Luna scorer invocation. Attributes: - metric: Preset, registered, or fine-tuned scorer name. + metric: Preset, registered, or fine-tuned scorer label. input: Optional user/system prompt text. output: Optional model response text. luna_model: Optional Luna model override. 
@@ -50,7 +87,7 @@ class ScorerInvokeRequest: def to_dict(self) -> JSONObject: """Convert to the public API request shape.""" - body: JSONObject = {"metric": self.metric} + body: JSONObject = {"scorer_label": self.metric} if self.input is not None: body["input"] = self.input if self.output is not None: @@ -87,7 +124,7 @@ class ScorerInvokeResponse: @classmethod def from_dict(cls, data: JSONObject) -> ScorerInvokeResponse: """Create a response model from the API JSON object.""" - metric_value = data.get("metric", "") + metric_value = data.get("scorer_label", data.get("metric", "")) status_value = data.get("status", "unknown") error_value = data.get("error_message") @@ -105,13 +142,15 @@ class GalileoLunaClient: """Thin HTTP client for Galileo Luna direct scorer invocation. Environment Variables: - GALILEO_API_KEY: Galileo API key (required). + GALILEO_API_SECRET_KEY or GALILEO_API_SECRET: Galileo API internal JWT signing secret. + GALILEO_API_KEY: Galileo API key fallback for public scorer invocation. GALILEO_CONSOLE_URL: Galileo Console URL (optional, defaults to production). """ def __init__( self, api_key: str | None = None, + api_secret: str | None = None, console_url: str | None = None, api_url: str | None = None, ) -> None: @@ -119,22 +158,28 @@ def __init__( Args: api_key: Galileo API key. If not provided, reads from GALILEO_API_KEY. + api_secret: Galileo API secret for internal JWT auth. If not provided, + reads from GALILEO_API_SECRET_KEY or GALILEO_API_SECRET. console_url: Galileo Console URL. If not provided, reads from GALILEO_CONSOLE_URL or uses the production console URL. api_url: Galileo API URL. If not provided, reads from GALILEO_API_URL before deriving from the console URL. Raises: - ValueError: If no API key is provided or found in the environment. + ValueError: If neither API secret nor API key is provided. 
""" + resolved_api_secret = ( + api_secret or os.getenv("GALILEO_API_SECRET_KEY") or os.getenv("GALILEO_API_SECRET") + ) resolved_api_key = api_key or os.getenv("GALILEO_API_KEY") - if not resolved_api_key: + if not resolved_api_secret and not resolved_api_key: raise ValueError( - "GALILEO_API_KEY is required. " - "Set it as an environment variable or pass it to the constructor." + "GALILEO_API_SECRET_KEY or GALILEO_API_KEY is required. " + "Set one as an environment variable or pass it to the constructor." ) self.api_key = resolved_api_key + self.api_secret = resolved_api_secret self.console_url = ( console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai" ) @@ -162,15 +207,34 @@ def _derive_api_url(self, console_url: str) -> str: async def _get_client(self) -> httpx.AsyncClient: """Get or create the HTTP client.""" if self._client is None or self._client.is_closed: + headers = {"Content-Type": "application/json"} + if self.api_secret is None and self.api_key is not None: + headers["Galileo-API-Key"] = self.api_key self._client = httpx.AsyncClient( - headers={ - "Galileo-API-Key": self.api_key, - "Content-Type": "application/json", - }, + headers=headers, timeout=httpx.Timeout(DEFAULT_TIMEOUT_SECS), ) return self._client + def _endpoint_and_headers( + self, + project_id: str | UUID | None, + headers: dict[str, str] | None, + ) -> tuple[str, dict[str, str]]: + request_headers = dict(headers or {}) + if self.api_secret is None: + return f"{self.api_base}{PUBLIC_SCORER_INVOKE_PATH}", request_headers + + if project_id is None: + raise ValueError( + "project_id is required when using GALILEO_API_SECRET_KEY internal auth." + ) + + request_headers["Authorization"] = ( + f"Bearer {_internal_auth_token(self.api_secret, project_id)}" + ) + return f"{self.api_base}{INTERNAL_SCORER_INVOKE_PATH}", request_headers + async def invoke( self, *, @@ -186,7 +250,7 @@ async def invoke( """Invoke a Galileo Luna scorer. 
Args: - metric: Preset, registered, or fine-tuned scorer name. + metric: Preset, registered, or fine-tuned scorer label. input: Optional user/system prompt text. output: Optional model response text. project_id: Optional Galileo project UUID for project-scoped scorer resolution. @@ -215,8 +279,7 @@ async def invoke( luna_model=luna_model, config=config, ).to_dict() - request_headers = dict(headers or {}) - endpoint = f"{self.api_base}/scorers/invoke" + endpoint, request_headers = self._endpoint_and_headers(project_id, headers) logger.debug("[GalileoLunaClient] POST %s", endpoint) logger.debug("[GalileoLunaClient] Request body: %s", request_body) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index 16a39930..f628cd8e 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -101,12 +101,18 @@ def __init__(self, config: LunaEvaluatorConfig) -> None: config: Validated LunaEvaluatorConfig instance. Raises: - ValueError: If GALILEO_API_KEY is not set. + ValueError: If neither GALILEO_API_SECRET_KEY nor GALILEO_API_KEY is set. """ - if not os.getenv("GALILEO_API_KEY"): + has_auth = ( + os.getenv("GALILEO_API_SECRET_KEY") + or os.getenv("GALILEO_API_SECRET") + or os.getenv("GALILEO_API_KEY") + ) + if not has_auth: raise ValueError( - "GALILEO_API_KEY environment variable must be set. " - "Set it to a Galileo API key before using galileo.luna." + "GALILEO_API_SECRET_KEY or GALILEO_API_KEY environment variable must be set. " + "Set an API secret for internal auth or a Galileo API key before using " + "galileo.luna." 
) super().__init__(config) diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 1b7e700e..53cf58ae 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -4,6 +4,7 @@ import json import os +from base64 import urlsafe_b64decode from unittest.mock import AsyncMock, patch import httpx @@ -12,6 +13,12 @@ from pydantic import ValidationError +def _decode_jwt_payload(token: str) -> dict[str, object]: + payload_segment = token.split(".")[1] + padded = payload_segment + ("=" * (-len(payload_segment) % 4)) + return json.loads(urlsafe_b64decode(padded.encode()).decode()) + + class TestLunaEvaluatorConfig: """Tests for direct Luna evaluator configuration.""" @@ -96,7 +103,7 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response( 200, json={ - "metric": "toxicity", + "scorer_label": "toxicity", "score": 0.82, "status": "success", "execution_time": 0.12, @@ -133,7 +140,7 @@ def handler(request: httpx.Request) -> httpx.Response: assert captured["body"] == { "input": "user prompt", "output": "model answer", - "metric": "toxicity", + "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", "luna_model": "luna-2", "config": {"top_k": 1}, @@ -144,6 +151,72 @@ def handler(request: httpx.Request) -> httpx.Response: assert isinstance(headers, dict) assert headers["galileo-api-key"] == "test-key" + @pytest.mark.asyncio + async def test_client_uses_internal_jwt_when_api_secret_is_set(self) -> None: + from agent_control_evaluator_galileo.luna import GalileoLunaClient + + captured: dict[str, object] = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["url"] = str(request.url) + captured["headers"] = dict(request.headers) + captured["body"] = json.loads(request.content.decode()) + return httpx.Response( + 200, + json={ + "scorer_label": "toxicity", + "score": 
0.82, + "status": "success", + "execution_time": 0.12, + }, + ) + + # Given: a Luna client configured with the Galileo API internal secret + with patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True): + client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088") + client._client = httpx.AsyncClient(transport=httpx.MockTransport(handler)) + + try: + # When: invoking a scorer with project context + response = await client.invoke( + metric="toxicity", + output="model answer", + project_id="12345678-1234-5678-1234-567812345678", + ) + finally: + await client.close() + + # Then: the internal scorer endpoint is called with a project-bound JWT + assert response.score == 0.82 + assert captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke" + assert captured["body"] == { + "output": "model answer", + "scorer_label": "toxicity", + "project_id": "12345678-1234-5678-1234-567812345678", + } + headers = captured["headers"] + assert isinstance(headers, dict) + assert "galileo-api-key" not in headers + auth_header = headers["authorization"] + assert isinstance(auth_header, str) + assert auth_header.startswith("Bearer ") + token_payload = _decode_jwt_payload(auth_header.removeprefix("Bearer ")) + assert token_payload["internal"] is True + assert token_payload["project_id"] == "12345678-1234-5678-1234-567812345678" + assert token_payload["scope"] == "scorers.invoke" + + @pytest.mark.asyncio + async def test_client_requires_project_id_for_internal_jwt(self) -> None: + from agent_control_evaluator_galileo.luna import GalileoLunaClient + + # Given: a Luna client configured with internal JWT auth + with patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True): + client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088") + + # When/Then: project_id is required because API uses it as the internal auth context + with pytest.raises(ValueError, match="project_id is 
required"): + await client.invoke(metric="toxicity", output="model answer") + class TestLunaEvaluator: """Tests for direct Luna evaluator behavior.""" @@ -156,12 +229,26 @@ def test_evaluator_metadata(self) -> None: assert LunaEvaluator.metadata.requires_api_key is True @patch.dict(os.environ, {}, clear=True) - def test_evaluator_init_without_api_key_raises(self) -> None: + def test_evaluator_init_without_auth_raises(self) -> None: from agent_control_evaluator_galileo.luna import LunaEvaluator - with pytest.raises(ValueError, match="GALILEO_API_KEY"): + with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY or GALILEO_API_KEY"): LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5}) + @patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True) + def test_evaluator_init_accepts_api_secret(self) -> None: + from agent_control_evaluator_galileo.luna import LunaEvaluator + + evaluator = LunaEvaluator.from_dict( + { + "metric": "toxicity", + "project_id": "12345678-1234-5678-1234-567812345678", + "threshold": 0.5, + } + ) + + assert str(evaluator.config.project_id) == "12345678-1234-5678-1234-567812345678" + @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) @pytest.mark.asyncio async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: From dd252be06b80c464b9c13929af166dd669cf235d Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Tue, 12 May 2026 10:49:41 -0700 Subject: [PATCH 04/20] add auth and update schema --- .../luna/client.py | 53 +++++++++---------- .../luna/config.py | 2 - .../luna/evaluator.py | 1 - .../galileo/tests/test_luna_evaluator.py | 35 +++++++++--- 4 files changed, 55 insertions(+), 36 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index e75b74bf..6786c5e8 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ 
b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -10,10 +10,12 @@ from hmac import new as hmac_new from json import dumps from time import time +from typing import Literal from uuid import UUID import httpx from agent_control_models import JSONObject, JSONValue +from pydantic import BaseModel, Field, model_validator logger = logging.getLogger(__name__) @@ -65,40 +67,37 @@ def _as_float_or_none(value: JSONValue) -> float | None: return None -@dataclass(frozen=True) -class ScorerInvokeRequest: +ScorerStepType = Literal["session", "trace", "span"] + + +class ScorerInvokeRequest(BaseModel): """Request payload for Galileo Luna scorer invocation. Attributes: - metric: Preset, registered, or fine-tuned scorer label. + step_type: Runtime step shape used by Galileo scorer input normalization. input: Optional user/system prompt text. output: Optional model response text. - luna_model: Optional Luna model override. + scorer_label: Preset, registered, or fine-tuned scorer label. project_id: Optional Galileo project UUID for project-scoped scorer resolution. config: Optional scorer-specific configuration. 
""" - metric: str - input: str | None = None - output: str | None = None + step_type: ScorerStepType = Field(default="span") + input: JSONValue = None + output: JSONValue = None + scorer_label: str = Field(min_length=1) project_id: str | UUID | None = None - luna_model: str | None = None config: JSONObject | None = None + @model_validator(mode="after") + def ensure_input_or_output(self) -> ScorerInvokeRequest: + if self.input is None and self.output is None: + raise ValueError("Either input or output must be set.") + return self + def to_dict(self) -> JSONObject: - """Convert to the public API request shape.""" - body: JSONObject = {"scorer_label": self.metric} - if self.input is not None: - body["input"] = self.input - if self.output is not None: - body["output"] = self.output - if self.project_id is not None: - body["project_id"] = str(self.project_id) - if self.luna_model is not None: - body["luna_model"] = self.luna_model - if self.config is not None: - body["config"] = self.config - return body + """Convert to the Galileo scorer invoke API request shape.""" + return self.model_dump(mode="json", exclude_none=True) @dataclass @@ -239,10 +238,10 @@ async def invoke( self, *, metric: str, - input: str | None = None, - output: str | None = None, + input: JSONValue = None, + output: JSONValue = None, + step_type: ScorerStepType = "span", project_id: str | UUID | None = None, - luna_model: str | None = None, config: JSONObject | None = None, timeout: float = DEFAULT_TIMEOUT_SECS, headers: dict[str, str] | None = None, @@ -253,8 +252,8 @@ async def invoke( metric: Preset, registered, or fine-tuned scorer label. input: Optional user/system prompt text. output: Optional model response text. + step_type: Runtime step shape used by Galileo scorer input normalization. project_id: Optional Galileo project UUID for project-scoped scorer resolution. - luna_model: Optional Luna model override. config: Optional scorer-specific configuration. timeout: Request timeout in seconds. 
headers: Additional request headers. @@ -272,11 +271,11 @@ async def invoke( raise ValueError("At least one of input or output must be provided.") request_body = ScorerInvokeRequest( - metric=metric, + scorer_label=metric, input=input, output=output, + step_type=step_type, project_id=project_id, - luna_model=luna_model, config=config, ).to_dict() endpoint, request_headers = self._endpoint_and_headers(project_id, headers) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index 241e040f..3bcc34a3 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -36,7 +36,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): project_id: Optional Galileo project UUID for project-scoped scorer resolution. threshold: Local threshold used by the evaluator for comparison. operator: Local comparison operator. Numeric operators use threshold as a number. - luna_model: Optional Luna model override sent to Galileo. scorer_config: Optional scorer-specific config sent as ``config``. timeout_ms: Request timeout in milliseconds. on_error: Error policy: allow=fail open, deny=fail closed. 
@@ -58,7 +57,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): default="gte", description="Local comparison operator applied to the raw Luna score.", ) - luna_model: str | None = Field(default=None, description="Optional Luna model override") scorer_config: JSONObject | None = Field( default=None, alias="config", diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index f628cd8e..8afea45d 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -199,7 +199,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: input=input_text if _has_text(input_text) else None, output=output_text if _has_text(output_text) else None, project_id=self.config.project_id, - luna_model=self.config.luna_model, config=self.config.scorer_config, timeout=self.get_timeout_seconds(), ) diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 53cf58ae..58bd201b 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -31,7 +31,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None: project_id="12345678-1234-5678-1234-567812345678", threshold=0.7, operator="gte", - luna_model="luna-2", config={"temperature": 0}, ) @@ -40,7 +39,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None: assert str(config.project_id) == "12345678-1234-5678-1234-567812345678" assert config.threshold == 0.7 assert config.operator == "gte" - assert config.luna_model == "luna-2" assert config.scorer_config == {"temperature": 0} def test_numeric_operator_requires_numeric_threshold(self) -> None: @@ -54,6 +52,33 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None: class 
TestGalileoLunaClient: """Tests for the GalileoLunaClient HTTP contract.""" + def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None: + from agent_control_evaluator_galileo.luna import ScorerInvokeRequest + + # Given: a scorer request with project context and scorer config + request = ScorerInvokeRequest( + scorer_label="toxicity", + input={"messages": [{"role": "user", "content": "hello"}]}, + project_id="12345678-1234-5678-1234-567812345678", + config={"top_k": 1}, + ) + + # Then: the serialized payload uses the Orbit scorer invoke fields + assert request.to_dict() == { + "step_type": "span", + "input": {"messages": [{"role": "user", "content": "hello"}]}, + "scorer_label": "toxicity", + "project_id": "12345678-1234-5678-1234-567812345678", + "config": {"top_k": 1}, + } + + def test_scorer_invoke_request_requires_input_or_output(self) -> None: + from agent_control_evaluator_galileo.luna import ScorerInvokeRequest + + # Given/When/Then: the request mirrors Orbit validation + with pytest.raises(ValidationError, match="Either input or output must be set"): + ScorerInvokeRequest(scorer_label="toxicity") + def test_client_uses_protect_api_url_derivation(self) -> None: from agent_control_evaluator_galileo.luna import GalileoLunaClient @@ -128,7 +153,6 @@ def handler(request: httpx.Request) -> httpx.Response: input="user prompt", output="model answer", project_id="12345678-1234-5678-1234-567812345678", - luna_model="luna-2", config={"top_k": 1}, ) finally: @@ -142,7 +166,7 @@ def handler(request: httpx.Request) -> httpx.Response: "output": "model answer", "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", - "luna_model": "luna-2", + "step_type": "span", "config": {"top_k": 1}, } assert "stage_name" not in captured["body"] @@ -193,6 +217,7 @@ def handler(request: httpx.Request) -> httpx.Response: "output": "model answer", "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", + "step_type": 
"span", } headers = captured["headers"] assert isinstance(headers, dict) @@ -301,7 +326,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: input="user prompt", output="model answer", project_id=evaluator.config.project_id, - luna_model=None, config=None, timeout=5.0, ) @@ -335,7 +359,6 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None: input="hello", output=None, project_id=None, - luna_model=None, config=None, timeout=10.0, ) From 74fcbeb4ce6fd91d3c861daf2b60f6d9e1ffe297 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Tue, 12 May 2026 11:11:57 -0700 Subject: [PATCH 05/20] fix(galileo): align luna scorer response schema --- .../luna/client.py | 44 +++++++++++-------- .../luna/evaluator.py | 2 +- .../galileo/tests/test_luna_evaluator.py | 42 +++++++++++++++++- 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index 6786c5e8..effc132a 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -5,7 +5,6 @@ import logging import os from base64 import urlsafe_b64encode -from dataclasses import dataclass, field from hashlib import sha256 from hmac import new as hmac_new from json import dumps @@ -15,7 +14,7 @@ import httpx from agent_control_models import JSONObject, JSONValue -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, PrivateAttr, model_validator logger = logging.getLogger(__name__) @@ -100,41 +99,48 @@ def to_dict(self) -> JSONObject: return self.model_dump(mode="json", exclude_none=True) -@dataclass -class ScorerInvokeResponse: +class ScorerInvokeResponse(BaseModel): """Response from Galileo Luna scorer invocation. Attributes: - metric: Echoed scorer metric. 
+ scorer_label: Echoed scorer label. score: Raw scorer value. status: Invocation status. execution_time: Execution time in seconds, when returned. error_message: Error detail for non-success statuses. - raw_response: Full response body for diagnostics. """ - metric: str + scorer_label: str score: JSONValue status: str = "unknown" execution_time: float | None = None error_message: str | None = None - raw_response: JSONObject = field(default_factory=dict) + _raw_response: JSONObject = PrivateAttr(default_factory=dict) + + @model_validator(mode="before") + @classmethod + def allow_legacy_metric_response(cls, data: object) -> object: + if isinstance(data, dict) and "scorer_label" not in data and "metric" in data: + return data | {"scorer_label": data["metric"]} + return data + + @property + def metric(self) -> str: + """Backward-compatible alias for existing evaluator metadata code.""" + return self.scorer_label + + @property + def raw_response(self) -> JSONObject: + return self._raw_response @classmethod def from_dict(cls, data: JSONObject) -> ScorerInvokeResponse: """Create a response model from the API JSON object.""" - metric_value = data.get("scorer_label", data.get("metric", "")) - status_value = data.get("status", "unknown") - error_value = data.get("error_message") - - return cls( - metric=str(metric_value) if metric_value is not None else "", - score=data.get("score"), - status=str(status_value) if status_value is not None else "unknown", - execution_time=_as_float_or_none(data.get("execution_time")), - error_message=str(error_value) if error_value is not None else None, - raw_response=data, + response = cls.model_validate( + data | {"execution_time": _as_float_or_none(data.get("execution_time"))} ) + response._raw_response = data + return response class GalileoLunaClient: diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index 
8afea45d..9db2f60d 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -227,7 +227,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: metadata: dict[str, Any] = { - "metric": response.metric or self.config.metric, + "metric": response.scorer_label or self.config.metric, "project_id": str(self.config.project_id) if self.config.project_id else None, "score": response.score, "threshold": self.config.threshold, diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 58bd201b..de9da5af 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -79,6 +79,44 @@ def test_scorer_invoke_request_requires_input_or_output(self) -> None: with pytest.raises(ValidationError, match="Either input or output must be set"): ScorerInvokeRequest(scorer_label="toxicity") + def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None: + from agent_control_evaluator_galileo.luna import ScorerInvokeResponse + + # Given: an API scorer invoke response + response = ScorerInvokeResponse.from_dict( + { + "scorer_label": "toxicity", + "score": 0.82, + "status": "success", + "execution_time": 0.12, + "error_message": None, + } + ) + + # Then: the model exposes the Orbit/API response fields + assert response.model_dump() == { + "scorer_label": "toxicity", + "score": 0.82, + "status": "success", + "execution_time": 0.12, + "error_message": None, + } + assert response.scorer_label == "toxicity" + assert response.metric == "toxicity" + assert response.raw_response["scorer_label"] == "toxicity" + + def test_scorer_invoke_response_accepts_legacy_metric_field(self) -> None: + from agent_control_evaluator_galileo.luna import 
ScorerInvokeResponse + + # Given/When: an older API response uses metric instead of scorer_label + response = ScorerInvokeResponse.from_dict( + {"metric": "toxicity", "score": 0.82, "status": "success"} + ) + + # Then: the client still normalizes it to the current response contract + assert response.scorer_label == "toxicity" + assert response.model_dump()["scorer_label"] == "toxicity" + def test_client_uses_protect_api_url_derivation(self) -> None: from agent_control_evaluator_galileo.luna import GalileoLunaClient @@ -293,7 +331,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: mock_invoke.return_value = ScorerInvokeResponse( - metric="toxicity", + scorer_label="toxicity", score=0.82, status="success", execution_time=0.1, @@ -343,7 +381,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None: with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: mock_invoke.return_value = ScorerInvokeResponse( - metric="toxicity", + scorer_label="toxicity", score=0.2, status="success", ) From 7b0a15d2b6d8b8a98a38d311c4818016e92ae394 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Wed, 13 May 2026 12:01:56 -0700 Subject: [PATCH 06/20] update the schemas and corresponding tests --- .../luna/client.py | 30 ++++------- .../luna/config.py | 6 +-- .../luna/evaluator.py | 10 ++-- .../galileo/tests/test_luna_evaluator.py | 51 +++++++------------ examples/galileo_luna/README.md | 2 +- examples/galileo_luna/setup_controls.py | 6 +-- 6 files changed, 40 insertions(+), 65 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index effc132a..426b1782 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ 
b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -66,14 +66,14 @@ def _as_float_or_none(value: JSONValue) -> float | None: return None -ScorerStepType = Literal["session", "trace", "span"] +RootType = Literal["session", "trace", "span"] class ScorerInvokeRequest(BaseModel): """Request payload for Galileo Luna scorer invocation. Attributes: - step_type: Runtime step shape used by Galileo scorer input normalization. + root_type: Runtime step shape used by Galileo scorer input normalization. input: Optional user/system prompt text. output: Optional model response text. scorer_label: Preset, registered, or fine-tuned scorer label. @@ -81,7 +81,7 @@ class ScorerInvokeRequest(BaseModel): config: Optional scorer-specific configuration. """ - step_type: ScorerStepType = Field(default="span") + root_type: RootType = Field(default="span") input: JSONValue = None output: JSONValue = None scorer_label: str = Field(min_length=1) @@ -117,18 +117,6 @@ class ScorerInvokeResponse(BaseModel): error_message: str | None = None _raw_response: JSONObject = PrivateAttr(default_factory=dict) - @model_validator(mode="before") - @classmethod - def allow_legacy_metric_response(cls, data: object) -> object: - if isinstance(data, dict) and "scorer_label" not in data and "metric" in data: - return data | {"scorer_label": data["metric"]} - return data - - @property - def metric(self) -> str: - """Backward-compatible alias for existing evaluator metadata code.""" - return self.scorer_label - @property def raw_response(self) -> JSONObject: return self._raw_response @@ -243,10 +231,10 @@ def _endpoint_and_headers( async def invoke( self, *, - metric: str, + scorer_label: str, input: JSONValue = None, output: JSONValue = None, - step_type: ScorerStepType = "span", + root_type: RootType = "span", project_id: str | UUID | None = None, config: JSONObject | None = None, timeout: float = DEFAULT_TIMEOUT_SECS, @@ -255,10 +243,10 @@ async def invoke( """Invoke a Galileo 
Luna scorer. Args: - metric: Preset, registered, or fine-tuned scorer label. + scorer_label: Preset, registered, or fine-tuned scorer label. input: Optional user/system prompt text. output: Optional model response text. - step_type: Runtime step shape used by Galileo scorer input normalization. + root_type: Runtime step shape used by Galileo scorer input normalization. project_id: Optional Galileo project UUID for project-scoped scorer resolution. config: Optional scorer-specific configuration. timeout: Request timeout in seconds. @@ -277,10 +265,10 @@ async def invoke( raise ValueError("At least one of input or output must be provided.") request_body = ScorerInvokeRequest( - scorer_label=metric, + scorer_label=scorer_label, input=input, output=output, - step_type=step_type, + root_type=root_type, project_id=project_id, config=config, ).to_dict() diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index 3bcc34a3..1e41a554 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -32,7 +32,7 @@ class LunaEvaluatorConfig(EvaluatorConfig): """Configuration for direct Luna scorer evaluation. Attributes: - metric: Preset, registered, or fine-tuned scorer name. + scorer_label: Preset, registered, or fine-tuned scorer label. project_id: Optional Galileo project UUID for project-scoped scorer resolution. threshold: Local threshold used by the evaluator for comparison. operator: Local comparison operator. Numeric operators use threshold as a number. @@ -40,11 +40,11 @@ class LunaEvaluatorConfig(EvaluatorConfig): timeout_ms: Request timeout in milliseconds. on_error: Error policy: allow=fail open, deny=fail closed. payload_field: Force selected data into input or output. 
If omitted, root step - payloads with input/output use both fields; scalar data is inferred from metric name. + payloads with input/output use both fields; scalar data is inferred from scorer label. include_raw_response: Include the raw API response in EvaluatorResult metadata. """ - metric: str = Field(..., min_length=1, description="Luna metric/scorer name to evaluate") + scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke") project_id: UUID | None = Field( default=None, description="Optional Galileo project UUID for project-scoped scorer resolution.", diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index 9db2f60d..a5b3f248 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -139,7 +139,7 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]: return input_text, output_text text = _coerce_payload_text(data) - if "output" in self.config.metric: + if "output" in self.config.scorer_label: return None, text return text, None @@ -190,12 +190,12 @@ async def evaluate(self, data: Any) -> EvaluatorResult: matched=False, confidence=1.0, message="No data to score with Luna", - metadata={"metric": self.config.metric}, + metadata={"scorer_label": self.config.scorer_label}, ) try: response = await self._get_client().invoke( - metric=self.config.metric, + scorer_label=self.config.scorer_label, input=input_text if _has_text(input_text) else None, output=output_text if _has_text(output_text) else None, project_id=self.config.project_id, @@ -227,7 +227,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: metadata: dict[str, Any] = { - "metric": response.scorer_label or self.config.metric, + 
"scorer_label": response.scorer_label or self.config.scorer_label, "project_id": str(self.config.project_id) if self.config.project_id else None, "score": response.score, "threshold": self.config.threshold, @@ -251,7 +251,7 @@ def _handle_error(self, error: Exception) -> EvaluatorResult: metadata={ "error": error_detail, "error_type": type(error).__name__, - "metric": self.config.metric, + "scorer_label": self.config.scorer_label, "fallback_action": fallback, }, error=None if matched else error_detail, diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index de9da5af..31323a42 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -27,7 +27,7 @@ def test_config_accepts_direct_scorer_fields(self) -> None: # Given: a direct scorer config with local thresholding config = LunaEvaluatorConfig( - metric="toxicity", + scorer_label="toxicity", project_id="12345678-1234-5678-1234-567812345678", threshold=0.7, operator="gte", @@ -35,7 +35,7 @@ def test_config_accepts_direct_scorer_fields(self) -> None: ) # Then: config is retained without Protect concepts - assert config.metric == "toxicity" + assert config.scorer_label == "toxicity" assert str(config.project_id) == "12345678-1234-5678-1234-567812345678" assert config.threshold == 0.7 assert config.operator == "gte" @@ -46,7 +46,7 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None: # Given/When/Then: numeric local comparison rejects non-numeric thresholds with pytest.raises(ValidationError, match="numeric threshold"): - LunaEvaluatorConfig(metric="toxicity", threshold="high", operator="gte") + LunaEvaluatorConfig(scorer_label="toxicity", threshold="high", operator="gte") class TestGalileoLunaClient: @@ -65,7 +65,7 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None: # Then: the serialized payload uses the Orbit scorer invoke fields 
assert request.to_dict() == { - "step_type": "span", + "root_type": "span", "input": {"messages": [{"role": "user", "content": "hello"}]}, "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", @@ -102,21 +102,8 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None: "error_message": None, } assert response.scorer_label == "toxicity" - assert response.metric == "toxicity" assert response.raw_response["scorer_label"] == "toxicity" - def test_scorer_invoke_response_accepts_legacy_metric_field(self) -> None: - from agent_control_evaluator_galileo.luna import ScorerInvokeResponse - - # Given/When: an older API response uses metric instead of scorer_label - response = ScorerInvokeResponse.from_dict( - {"metric": "toxicity", "score": 0.82, "status": "success"} - ) - - # Then: the client still normalizes it to the current response contract - assert response.scorer_label == "toxicity" - assert response.model_dump()["scorer_label"] == "toxicity" - def test_client_uses_protect_api_url_derivation(self) -> None: from agent_control_evaluator_galileo.luna import GalileoLunaClient @@ -187,7 +174,7 @@ def handler(request: httpx.Request) -> httpx.Response: try: # When: invoking a scorer response = await client.invoke( - metric="toxicity", + scorer_label="toxicity", input="user prompt", output="model answer", project_id="12345678-1234-5678-1234-567812345678", @@ -204,7 +191,7 @@ def handler(request: httpx.Request) -> httpx.Response: "output": "model answer", "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", - "step_type": "span", + "root_type": "span", "config": {"top_k": 1}, } assert "stage_name" not in captured["body"] @@ -241,7 +228,7 @@ def handler(request: httpx.Request) -> httpx.Response: try: # When: invoking a scorer with project context response = await client.invoke( - metric="toxicity", + scorer_label="toxicity", output="model answer", project_id="12345678-1234-5678-1234-567812345678", ) @@ 
-255,7 +242,7 @@ def handler(request: httpx.Request) -> httpx.Response: "output": "model answer", "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", - "step_type": "span", + "root_type": "span", } headers = captured["headers"] assert isinstance(headers, dict) @@ -278,7 +265,7 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None: # When/Then: project_id is required because API uses it as the internal auth context with pytest.raises(ValueError, match="project_id is required"): - await client.invoke(metric="toxicity", output="model answer") + await client.invoke(scorer_label="toxicity", output="model answer") class TestLunaEvaluator: @@ -296,7 +283,7 @@ def test_evaluator_init_without_auth_raises(self) -> None: from agent_control_evaluator_galileo.luna import LunaEvaluator with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY or GALILEO_API_KEY"): - LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5}) + LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5}) @patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True) def test_evaluator_init_accepts_api_secret(self) -> None: @@ -304,7 +291,7 @@ def test_evaluator_init_accepts_api_secret(self) -> None: evaluator = LunaEvaluator.from_dict( { - "metric": "toxicity", + "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", "threshold": 0.5, } @@ -321,7 +308,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: # Given: a direct Luna evaluator and a raw successful scorer response evaluator = LunaEvaluator.from_dict( { - "metric": "toxicity", + "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", "threshold": 0.7, "operator": "gte", @@ -350,7 +337,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: assert result.matched is True assert result.confidence == 0.82 assert result.metadata == { - "metric": 
"toxicity", + "scorer_label": "toxicity", "project_id": "12345678-1234-5678-1234-567812345678", "score": 0.82, "threshold": 0.7, @@ -360,7 +347,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: "error_message": None, } mock_invoke.assert_awaited_once_with( - metric="toxicity", + scorer_label="toxicity", input="user prompt", output="model answer", project_id=evaluator.config.project_id, @@ -376,7 +363,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None: # Given: a raw scorer value below the local threshold evaluator = LunaEvaluator.from_dict( - {"metric": "toxicity", "threshold": 0.7, "operator": "gte"} + {"scorer_label": "toxicity", "threshold": 0.7, "operator": "gte"} ) with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: @@ -393,7 +380,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None: assert result.matched is False assert result.confidence == 0.2 mock_invoke.assert_awaited_once_with( - metric="toxicity", + scorer_label="toxicity", input="hello", output=None, project_id=None, @@ -408,7 +395,7 @@ async def test_evaluator_does_not_call_api_for_empty_data(self) -> None: from agent_control_evaluator_galileo.luna.client import GalileoLunaClient # Given: an evaluator and empty selected data - evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5}) + evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5}) with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: # When: evaluating empty data @@ -427,7 +414,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None: from agent_control_evaluator_galileo.luna.client import GalileoLunaClient # Given: default fail-open behavior - evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5}) + evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5}) with 
patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: mock_invoke.side_effect = RuntimeError("service unavailable") @@ -449,7 +436,7 @@ async def test_evaluator_fail_closed_matches_without_error_field(self) -> None: # Given: fail-closed behavior for scorer errors evaluator = LunaEvaluator.from_dict( - {"metric": "toxicity", "threshold": 0.5, "on_error": "deny"} + {"scorer_label": "toxicity", "threshold": 0.5, "on_error": "deny"} ) with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md index d43a2d71..534ef640 100644 --- a/examples/galileo_luna/README.md +++ b/examples/galileo_luna/README.md @@ -33,7 +33,7 @@ export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000" Optional scorer settings: ```bash -export GALILEO_LUNA_METRIC="toxicity" +export GALILEO_LUNA_SCORER_LABEL="toxicity" export GALILEO_LUNA_THRESHOLD="0.5" ``` diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py index 3d325cde..69a36ad5 100644 --- a/examples/galileo_luna/setup_controls.py +++ b/examples/galileo_luna/setup_controls.py @@ -23,7 +23,7 @@ AGENT_DESCRIPTION = "Demo agent protected by direct Galileo Luna scorer controls" SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") -LUNA_METRIC = os.getenv("GALILEO_LUNA_METRIC", "toxicity") +LUNA_SCORER_LABEL = os.getenv("GALILEO_LUNA_SCORER_LABEL", "toxicity") LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5")) GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID") @@ -41,7 +41,7 @@ def luna_config() -> dict[str, Any]: """Build the direct Luna evaluator config used by the composite control.""" config: dict[str, Any] = { - "metric": LUNA_METRIC, + "scorer_label": LUNA_SCORER_LABEL, "threshold": LUNA_THRESHOLD, "operator": "gte", "payload_field": "output", @@ -158,7 +158,7 @@ async def setup_demo() -> None: print("Setting up direct Galileo 
Luna demo controls") print(f"Server: {SERVER_URL}") print(f"Agent: {AGENT_NAME}") - print(f"Luna: metric={LUNA_METRIC!r}, threshold={LUNA_THRESHOLD}") + print(f"Luna: scorer_label={LUNA_SCORER_LABEL!r}, threshold={LUNA_THRESHOLD}") if GALILEO_PROJECT_ID: print(f"Project ID: {GALILEO_PROJECT_ID}") From 523524d07fb9837fa574106fe6346a07f25e25be Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Wed, 13 May 2026 17:37:14 -0700 Subject: [PATCH 07/20] update the schemas for scorer --- .../luna/__init__.py | 2 + .../luna/client.py | 33 ++++++++-------- .../galileo/tests/test_luna_evaluator.py | 37 +++++++++--------- .../src/agent_control/evaluators/__init__.py | 38 +++++++++++-------- 4 files changed, 62 insertions(+), 48 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py index c3ff0375..b26feaac 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py @@ -2,6 +2,7 @@ from agent_control_evaluator_galileo.luna.client import ( GalileoLunaClient, + ScorerInvokeInputs, ScorerInvokeRequest, ScorerInvokeResponse, ) @@ -10,6 +11,7 @@ __all__ = [ "GalileoLunaClient", + "ScorerInvokeInputs", "ScorerInvokeRequest", "ScorerInvokeResponse", "LunaEvaluatorConfig", diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index 426b1782..a2ccdc3f 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -9,7 +9,6 @@ from hmac import new as hmac_new from json import dumps from time import time -from typing import Literal from uuid import UUID import httpx @@ -66,32 +65,38 @@ def 
_as_float_or_none(value: JSONValue) -> float | None: return None -RootType = Literal["session", "trace", "span"] +def _has_value(value: JSONValue) -> bool: + return value is not None and value != "" + + +class ScorerInvokeInputs(BaseModel): + """Input values sent to Galileo's scorer invoke API.""" + + query: JSONValue = "" + response: JSONValue = "" + ground_truth: JSONValue = None + tools: JSONValue = None class ScorerInvokeRequest(BaseModel): """Request payload for Galileo Luna scorer invocation. Attributes: - root_type: Runtime step shape used by Galileo scorer input normalization. - input: Optional user/system prompt text. - output: Optional model response text. + inputs: Selected scorer input values. scorer_label: Preset, registered, or fine-tuned scorer label. project_id: Optional Galileo project UUID for project-scoped scorer resolution. config: Optional scorer-specific configuration. """ - root_type: RootType = Field(default="span") - input: JSONValue = None - output: JSONValue = None scorer_label: str = Field(min_length=1) + inputs: ScorerInvokeInputs project_id: str | UUID | None = None config: JSONObject | None = None @model_validator(mode="after") def ensure_input_or_output(self) -> ScorerInvokeRequest: - if self.input is None and self.output is None: - raise ValueError("Either input or output must be set.") + if not (_has_value(self.inputs.query) or _has_value(self.inputs.response)): + raise ValueError("Either inputs.query or inputs.response must be set.") return self def to_dict(self) -> JSONObject: @@ -234,7 +239,6 @@ async def invoke( scorer_label: str, input: JSONValue = None, output: JSONValue = None, - root_type: RootType = "span", project_id: str | UUID | None = None, config: JSONObject | None = None, timeout: float = DEFAULT_TIMEOUT_SECS, @@ -246,7 +250,6 @@ async def invoke( scorer_label: Preset, registered, or fine-tuned scorer label. input: Optional user/system prompt text. output: Optional model response text. 
- root_type: Runtime step shape used by Galileo scorer input normalization. project_id: Optional Galileo project UUID for project-scoped scorer resolution. config: Optional scorer-specific configuration. timeout: Request timeout in seconds. @@ -266,9 +269,9 @@ async def invoke( request_body = ScorerInvokeRequest( scorer_label=scorer_label, - input=input, - output=output, - root_type=root_type, + inputs=ScorerInvokeInputs( + query="" if input is None else input, response="" if output is None else output + ), project_id=project_id, config=config, ).to_dict() diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 31323a42..9f4ae862 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -52,22 +52,24 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None: class TestGalileoLunaClient: """Tests for the GalileoLunaClient HTTP contract.""" - def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None: - from agent_control_evaluator_galileo.luna import ScorerInvokeRequest + def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: + from agent_control_evaluator_galileo.luna import ScorerInvokeInputs, ScorerInvokeRequest # Given: a scorer request with project context and scorer config request = ScorerInvokeRequest( scorer_label="toxicity", - input={"messages": [{"role": "user", "content": "hello"}]}, + inputs=ScorerInvokeInputs(query={"messages": [{"role": "user", "content": "hello"}]}), project_id="12345678-1234-5678-1234-567812345678", config={"top_k": 1}, ) - # Then: the serialized payload uses the Orbit scorer invoke fields + # Then: the serialized payload uses the API-owned scorer invoke fields assert request.to_dict() == { - "root_type": "span", - "input": {"messages": [{"role": "user", "content": "hello"}]}, "scorer_label": "toxicity", + "inputs": { + "query": {"messages": 
[{"role": "user", "content": "hello"}]}, + "response": "", + }, "project_id": "12345678-1234-5678-1234-567812345678", "config": {"top_k": 1}, } @@ -75,11 +77,13 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None: def test_scorer_invoke_request_requires_input_or_output(self) -> None: from agent_control_evaluator_galileo.luna import ScorerInvokeRequest - # Given/When/Then: the request mirrors Orbit validation - with pytest.raises(ValidationError, match="Either input or output must be set"): - ScorerInvokeRequest(scorer_label="toxicity") + # Given/When/Then: the request mirrors API validation + with pytest.raises( + ValidationError, match="Either inputs.query or inputs.response must be set" + ): + ScorerInvokeRequest(scorer_label="toxicity", inputs={}) - def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None: + def test_scorer_invoke_response_matches_api_schema_shape(self) -> None: from agent_control_evaluator_galileo.luna import ScorerInvokeResponse # Given: an API scorer invoke response @@ -93,7 +97,7 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None: } ) - # Then: the model exposes the Orbit/API response fields + # Then: the model exposes the API response fields assert response.model_dump() == { "scorer_label": "toxicity", "score": 0.82, @@ -187,11 +191,9 @@ def handler(request: httpx.Request) -> httpx.Response: assert response.score == 0.82 assert captured["url"] == "https://api.demo-v2.galileocloud.io/scorers/invoke" assert captured["body"] == { - "input": "user prompt", - "output": "model answer", "scorer_label": "toxicity", + "inputs": {"query": "user prompt", "response": "model answer"}, "project_id": "12345678-1234-5678-1234-567812345678", - "root_type": "span", "config": {"top_k": 1}, } assert "stage_name" not in captured["body"] @@ -237,12 +239,13 @@ def handler(request: httpx.Request) -> httpx.Response: # Then: the internal scorer endpoint is called with a project-bound JWT assert 
response.score == 0.82 - assert captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke" + assert ( + captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke" + ) assert captured["body"] == { - "output": "model answer", "scorer_label": "toxicity", + "inputs": {"query": "", "response": "model answer"}, "project_id": "12345678-1234-5678-1234-567812345678", - "root_type": "span", } headers = captured["headers"] assert isinstance(headers, dict) diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py index 9fd87e71..8366a107 100644 --- a/sdks/python/src/agent_control/evaluators/__init__.py +++ b/sdks/python/src/agent_control/evaluators/__init__.py @@ -44,19 +44,23 @@ LunaEvaluator, LunaEvaluatorConfig, LunaOperator, + ScorerInvokeInputs, ScorerInvokeRequest, ScorerInvokeResponse, ) - __all__.extend([ - "GalileoLunaClient", - "ScorerInvokeRequest", - "ScorerInvokeResponse", - "LunaEvaluator", - "LunaEvaluatorConfig", - "LunaOperator", - "LUNA_AVAILABLE", - ]) + __all__.extend( + [ + "GalileoLunaClient", + "ScorerInvokeInputs", + "ScorerInvokeRequest", + "ScorerInvokeResponse", + "LunaEvaluator", + "LunaEvaluatorConfig", + "LunaOperator", + "LUNA_AVAILABLE", + ] + ) except ImportError: pass @@ -69,12 +73,14 @@ Luna2Operator, ) - __all__.extend([ - "Luna2Evaluator", - "Luna2EvaluatorConfig", - "Luna2Metric", - "Luna2Operator", - "LUNA2_AVAILABLE", - ]) + __all__.extend( + [ + "Luna2Evaluator", + "Luna2EvaluatorConfig", + "Luna2Metric", + "Luna2Operator", + "LUNA2_AVAILABLE", + ] + ) except ImportError: pass From 34f430df0b8934a670286ea4c9712254fd35e748 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Wed, 13 May 2026 21:56:33 -0700 Subject: [PATCH 08/20] update luna client schemas --- .../luna/client.py | 10 +++++++-- .../galileo/tests/test_luna_evaluator.py | 21 +++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git 
a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index a2ccdc3f..86033339 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -66,7 +66,13 @@ def _as_float_or_none(value: JSONValue) -> float | None: def _has_value(value: JSONValue) -> bool: - return value is not None and value != "" + if value is None: + return False + if isinstance(value, str): + return value.strip() != "" + if isinstance(value, (list, dict)): + return len(value) > 0 + return True class ScorerInvokeInputs(BaseModel): @@ -264,7 +270,7 @@ async def invoke( httpx.HTTPStatusError: If the API returns an error status code. httpx.RequestError: If the request fails before a response is received. """ - if input is None and output is None: + if not (_has_value(input) or _has_value(output)): raise ValueError("At least one of input or output must be provided.") request_body = ScorerInvokeRequest( diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 9f4ae862..80a5e00b 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -74,14 +74,18 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: "config": {"top_k": 1}, } - def test_scorer_invoke_request_requires_input_or_output(self) -> None: + @pytest.mark.parametrize("empty_value", ["", " ", {}, []]) + def test_scorer_invoke_request_requires_input_or_output(self, empty_value: object) -> None: from agent_control_evaluator_galileo.luna import ScorerInvokeRequest # Given/When/Then: the request mirrors API validation with pytest.raises( ValidationError, match="Either inputs.query or inputs.response must be set" ): - ScorerInvokeRequest(scorer_label="toxicity", 
inputs={}) + ScorerInvokeRequest( + scorer_label="toxicity", + inputs={"query": empty_value, "response": empty_value}, + ) def test_scorer_invoke_response_matches_api_schema_shape(self) -> None: from agent_control_evaluator_galileo.luna import ScorerInvokeResponse @@ -270,6 +274,19 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None: with pytest.raises(ValueError, match="project_id is required"): await client.invoke(scorer_label="toxicity", output="model answer") + @pytest.mark.asyncio + @pytest.mark.parametrize("empty_value", ["", " ", {}, []]) + async def test_client_rejects_missing_input_and_output_values(self, empty_value: object) -> None: + from agent_control_evaluator_galileo.luna import GalileoLunaClient + + # Given: a Luna client and scorer input values that API treats as missing + with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}, clear=True): + client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088") + + # When/Then: the client rejects the request before calling API + with pytest.raises(ValueError, match="At least one of input or output must be provided"): + await client.invoke(scorer_label="toxicity", input=empty_value, output=empty_value) + class TestLunaEvaluator: """Tests for direct Luna evaluator behavior.""" From ad0b2dc98b30fcaffe0c5897cfee08e96de83e03 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Wed, 13 May 2026 21:59:37 -0700 Subject: [PATCH 09/20] fix tests --- evaluators/contrib/galileo/tests/test_luna_evaluator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 80a5e00b..5cf1fcf8 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -276,7 +276,9 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None: @pytest.mark.asyncio 
@pytest.mark.parametrize("empty_value", ["", " ", {}, []]) - async def test_client_rejects_missing_input_and_output_values(self, empty_value: object) -> None: + async def test_client_rejects_missing_input_and_output_values( + self, empty_value: object + ) -> None: from agent_control_evaluator_galileo.luna import GalileoLunaClient # Given: a Luna client and scorer input values that API treats as missing From 81cea0471518dd22e0964889412155d5122881de Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Thu, 14 May 2026 15:11:17 -0700 Subject: [PATCH 10/20] remove unwanted fields --- .../luna/config.py | 16 ----------- .../luna/evaluator.py | 15 ++--------- .../galileo/tests/test_luna_evaluator.py | 27 ++----------------- 3 files changed, 4 insertions(+), 54 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index 1e41a554..7bf5de48 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -38,10 +38,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): operator: Local comparison operator. Numeric operators use threshold as a number. scorer_config: Optional scorer-specific config sent as ``config``. timeout_ms: Request timeout in milliseconds. - on_error: Error policy: allow=fail open, deny=fail closed. - payload_field: Force selected data into input or output. If omitted, root step - payloads with input/output use both fields; scalar data is inferred from scorer label. - include_raw_response: Include the raw API response in EvaluatorResult metadata. 
""" scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke") @@ -69,18 +65,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): le=60000, description="Request timeout in milliseconds (1-60 seconds)", ) - on_error: Literal["allow", "deny"] = Field( - default="allow", - description="Action on error: 'allow' (fail open) or 'deny' (fail closed)", - ) - payload_field: Literal["input", "output"] | None = Field( - default=None, - description="Explicitly set which scorer payload field receives scalar selected data.", - ) - include_raw_response: bool = Field( - default=False, - description="Include the raw scorer response in result metadata.", - ) @model_validator(mode="after") def validate_threshold(self) -> LunaEvaluatorConfig: diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index a5b3f248..f9e0ad0d 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -126,12 +126,6 @@ def _get_client(self) -> GalileoLunaClient: def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]: """Prepare scorer input/output fields from selected data.""" - if self.config.payload_field is not None: - text = _coerce_payload_text(data) - if self.config.payload_field == "output": - return None, text - return text, None - if isinstance(data, dict): input_text = _extract_dict_text(data, "input") output_text = _extract_dict_text(data, "output") @@ -236,25 +230,20 @@ def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: "execution_time_seconds": response.execution_time, "error_message": response.error_message, } - if self.config.include_raw_response: - metadata["raw_response"] = response.raw_response return metadata def _handle_error(self, error: Exception) -> EvaluatorResult: - fallback = 
self.config.on_error - matched = fallback == "deny" error_detail = str(error) return EvaluatorResult( - matched=matched, + matched=False, confidence=0.0, message=f"Luna evaluation error: {error_detail}", metadata={ "error": error_detail, "error_type": type(error).__name__, "scorer_label": self.config.scorer_label, - "fallback_action": fallback, }, - error=None if matched else error_detail, + error=error_detail, ) async def aclose(self) -> None: diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 5cf1fcf8..1b0bcef8 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -435,7 +435,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None: from agent_control_evaluator_galileo.luna import LunaEvaluator from agent_control_evaluator_galileo.luna.client import GalileoLunaClient - # Given: default fail-open behavior + # Given: fixed fail-open behavior for scorer errors evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5}) with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: @@ -448,27 +448,4 @@ async def test_evaluator_fail_open_sets_error(self) -> None: assert result.matched is False assert result.error == "service unavailable" assert result.metadata is not None - assert result.metadata["fallback_action"] == "allow" - - @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @pytest.mark.asyncio - async def test_evaluator_fail_closed_matches_without_error_field(self) -> None: - from agent_control_evaluator_galileo.luna import LunaEvaluator - from agent_control_evaluator_galileo.luna.client import GalileoLunaClient - - # Given: fail-closed behavior for scorer errors - evaluator = LunaEvaluator.from_dict( - {"scorer_label": "toxicity", "threshold": 0.5, "on_error": "deny"} - ) - - with patch.object(GalileoLunaClient, "invoke", 
new_callable=AsyncMock) as mock_invoke: - mock_invoke.side_effect = RuntimeError("service unavailable") - - # When: the scorer call fails - result = await evaluator.evaluate("hello") - - # Then: the control matches so deny/steer actions can be applied by the engine - assert result.matched is True - assert result.error is None - assert result.metadata is not None - assert result.metadata["fallback_action"] == "deny" + assert "fallback_action" not in result.metadata From f3cf8f72609c599833542c75a0b0c408255e789c Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Thu, 14 May 2026 16:41:04 -0700 Subject: [PATCH 11/20] remove project_id from evaluator config --- .../luna/client.py | 20 +------ .../luna/config.py | 6 -- .../luna/evaluator.py | 2 - .../galileo/tests/test_luna_evaluator.py | 57 ++++++++++--------- 4 files changed, 33 insertions(+), 52 deletions(-) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index 86033339..caca997e 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -9,7 +9,6 @@ from hmac import new as hmac_new from json import dumps from time import time -from uuid import UUID import httpx from agent_control_models import JSONObject, JSONValue @@ -29,7 +28,6 @@ def _b64url(data: bytes) -> str: def _internal_auth_token( api_secret: str, - project_id: str | UUID, ttl_seconds: int = DEFAULT_INTERNAL_TOKEN_TTL_SECS, ) -> str: """Create the internal JWT expected by Galileo API internal routes.""" @@ -37,7 +35,6 @@ def _internal_auth_token( header = {"alg": "HS256", "typ": "JWT"} payload = { "internal": True, - "project_id": str(project_id), "scope": "scorers.invoke", "iat": now, "exp": now + ttl_seconds, @@ -90,13 +87,11 @@ class ScorerInvokeRequest(BaseModel): Attributes: inputs: Selected scorer input values. 
scorer_label: Preset, registered, or fine-tuned scorer label. - project_id: Optional Galileo project UUID for project-scoped scorer resolution. config: Optional scorer-specific configuration. """ scorer_label: str = Field(min_length=1) inputs: ScorerInvokeInputs - project_id: str | UUID | None = None config: JSONObject | None = None @model_validator(mode="after") @@ -222,21 +217,13 @@ async def _get_client(self) -> httpx.AsyncClient: def _endpoint_and_headers( self, - project_id: str | UUID | None, headers: dict[str, str] | None, ) -> tuple[str, dict[str, str]]: request_headers = dict(headers or {}) if self.api_secret is None: return f"{self.api_base}{PUBLIC_SCORER_INVOKE_PATH}", request_headers - if project_id is None: - raise ValueError( - "project_id is required when using GALILEO_API_SECRET_KEY internal auth." - ) - - request_headers["Authorization"] = ( - f"Bearer {_internal_auth_token(self.api_secret, project_id)}" - ) + request_headers["Authorization"] = f"Bearer {_internal_auth_token(self.api_secret)}" return f"{self.api_base}{INTERNAL_SCORER_INVOKE_PATH}", request_headers async def invoke( @@ -245,7 +232,6 @@ async def invoke( scorer_label: str, input: JSONValue = None, output: JSONValue = None, - project_id: str | UUID | None = None, config: JSONObject | None = None, timeout: float = DEFAULT_TIMEOUT_SECS, headers: dict[str, str] | None = None, @@ -256,7 +242,6 @@ async def invoke( scorer_label: Preset, registered, or fine-tuned scorer label. input: Optional user/system prompt text. output: Optional model response text. - project_id: Optional Galileo project UUID for project-scoped scorer resolution. config: Optional scorer-specific configuration. timeout: Request timeout in seconds. headers: Additional request headers. 
@@ -278,10 +263,9 @@ async def invoke( inputs=ScorerInvokeInputs( query="" if input is None else input, response="" if output is None else output ), - project_id=project_id, config=config, ).to_dict() - endpoint, request_headers = self._endpoint_and_headers(project_id, headers) + endpoint, request_headers = self._endpoint_and_headers(headers) logger.debug("[GalileoLunaClient] POST %s", endpoint) logger.debug("[GalileoLunaClient] Request body: %s", request_body) diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index 7bf5de48..0f0d86d5 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -3,7 +3,6 @@ from __future__ import annotations from typing import Literal -from uuid import UUID from agent_control_evaluators import EvaluatorConfig from agent_control_models import JSONObject, JSONValue @@ -33,7 +32,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): Attributes: scorer_label: Preset, registered, or fine-tuned scorer label. - project_id: Optional Galileo project UUID for project-scoped scorer resolution. threshold: Local threshold used by the evaluator for comparison. operator: Local comparison operator. Numeric operators use threshold as a number. scorer_config: Optional scorer-specific config sent as ``config``. 
@@ -41,10 +39,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): """ scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke") - project_id: UUID | None = Field( - default=None, - description="Optional Galileo project UUID for project-scoped scorer resolution.", - ) threshold: JSONValue = Field( default=0.5, description="Local threshold used to decide whether the control matches.", diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index f9e0ad0d..15798074 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -192,7 +192,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: scorer_label=self.config.scorer_label, input=input_text if _has_text(input_text) else None, output=output_text if _has_text(output_text) else None, - project_id=self.config.project_id, config=self.config.scorer_config, timeout=self.get_timeout_seconds(), ) @@ -222,7 +221,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: metadata: dict[str, Any] = { "scorer_label": response.scorer_label or self.config.scorer_label, - "project_id": str(self.config.project_id) if self.config.project_id else None, "score": response.score, "threshold": self.config.threshold, "operator": self.config.operator, diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 1b0bcef8..4e1f45b8 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -28,7 +28,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None: # Given: a direct scorer config with local thresholding config = LunaEvaluatorConfig( 
scorer_label="toxicity", - project_id="12345678-1234-5678-1234-567812345678", threshold=0.7, operator="gte", config={"temperature": 0}, @@ -36,7 +35,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None: # Then: config is retained without Protect concepts assert config.scorer_label == "toxicity" - assert str(config.project_id) == "12345678-1234-5678-1234-567812345678" assert config.threshold == 0.7 assert config.operator == "gte" assert config.scorer_config == {"temperature": 0} @@ -55,11 +53,10 @@ class TestGalileoLunaClient: def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: from agent_control_evaluator_galileo.luna import ScorerInvokeInputs, ScorerInvokeRequest - # Given: a scorer request with project context and scorer config + # Given: a scorer request with scorer config request = ScorerInvokeRequest( scorer_label="toxicity", inputs=ScorerInvokeInputs(query={"messages": [{"role": "user", "content": "hello"}]}), - project_id="12345678-1234-5678-1234-567812345678", config={"top_k": 1}, ) @@ -70,7 +67,6 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: "query": {"messages": [{"role": "user", "content": "hello"}]}, "response": "", }, - "project_id": "12345678-1234-5678-1234-567812345678", "config": {"top_k": 1}, } @@ -185,7 +181,6 @@ def handler(request: httpx.Request) -> httpx.Response: scorer_label="toxicity", input="user prompt", output="model answer", - project_id="12345678-1234-5678-1234-567812345678", config={"top_k": 1}, ) finally: @@ -197,7 +192,6 @@ def handler(request: httpx.Request) -> httpx.Response: assert captured["body"] == { "scorer_label": "toxicity", "inputs": {"query": "user prompt", "response": "model answer"}, - "project_id": "12345678-1234-5678-1234-567812345678", "config": {"top_k": 1}, } assert "stage_name" not in captured["body"] @@ -232,16 +226,12 @@ def handler(request: httpx.Request) -> httpx.Response: client._client = httpx.AsyncClient(transport=httpx.MockTransport(handler)) try: - 
# When: invoking a scorer with project context - response = await client.invoke( - scorer_label="toxicity", - output="model answer", - project_id="12345678-1234-5678-1234-567812345678", - ) + # When: invoking a scorer with internal JWT auth + response = await client.invoke(scorer_label="toxicity", output="model answer") finally: await client.close() - # Then: the internal scorer endpoint is called with a project-bound JWT + # Then: the internal scorer endpoint is called with an internal JWT assert response.score == 0.82 assert ( captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke" @@ -249,7 +239,6 @@ def handler(request: httpx.Request) -> httpx.Response: assert captured["body"] == { "scorer_label": "toxicity", "inputs": {"query": "", "response": "model answer"}, - "project_id": "12345678-1234-5678-1234-567812345678", } headers = captured["headers"] assert isinstance(headers, dict) @@ -259,20 +248,41 @@ def handler(request: httpx.Request) -> httpx.Response: assert auth_header.startswith("Bearer ") token_payload = _decode_jwt_payload(auth_header.removeprefix("Bearer ")) assert token_payload["internal"] is True - assert token_payload["project_id"] == "12345678-1234-5678-1234-567812345678" assert token_payload["scope"] == "scorers.invoke" @pytest.mark.asyncio - async def test_client_requires_project_id_for_internal_jwt(self) -> None: + async def test_client_uses_internal_jwt_without_api_key(self) -> None: from agent_control_evaluator_galileo.luna import GalileoLunaClient # Given: a Luna client configured with internal JWT auth with patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True): client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088") - # When/Then: project_id is required because API uses it as the internal auth context - with pytest.raises(ValueError, match="project_id is required"): - await client.invoke(scorer_label="toxicity", output="model answer") + captured: dict[str, 
object] = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["headers"] = dict(request.headers) + return httpx.Response( + 200, + json={"scorer_label": "toxicity", "score": 0.82, "status": "success"}, + ) + + client._client = httpx.AsyncClient(transport=httpx.MockTransport(handler)) + try: + # When: invoking without project context + response = await client.invoke(scorer_label="toxicity", output="model answer") + finally: + await client.close() + + # Then: internal JWT auth still works + assert response.score == 0.82 + headers = captured["headers"] + assert isinstance(headers, dict) + auth_header = headers["authorization"] + assert isinstance(auth_header, str) + token_payload = _decode_jwt_payload(auth_header.removeprefix("Bearer ")) + assert token_payload["internal"] is True + assert token_payload["scope"] == "scorers.invoke" @pytest.mark.asyncio @pytest.mark.parametrize("empty_value", ["", " ", {}, []]) @@ -314,12 +324,11 @@ def test_evaluator_init_accepts_api_secret(self) -> None: evaluator = LunaEvaluator.from_dict( { "scorer_label": "toxicity", - "project_id": "12345678-1234-5678-1234-567812345678", "threshold": 0.5, } ) - assert str(evaluator.config.project_id) == "12345678-1234-5678-1234-567812345678" + assert evaluator.config.scorer_label == "toxicity" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) @pytest.mark.asyncio @@ -331,7 +340,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: evaluator = LunaEvaluator.from_dict( { "scorer_label": "toxicity", - "project_id": "12345678-1234-5678-1234-567812345678", "threshold": 0.7, "operator": "gte", "timeout_ms": 5000, @@ -360,7 +368,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: assert result.confidence == 0.82 assert result.metadata == { "scorer_label": "toxicity", - "project_id": "12345678-1234-5678-1234-567812345678", "score": 0.82, "threshold": 0.7, "operator": "gte", @@ -372,7 +379,6 @@ async def 
test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: scorer_label="toxicity", input="user prompt", output="model answer", - project_id=evaluator.config.project_id, config=None, timeout=5.0, ) @@ -405,7 +411,6 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None: scorer_label="toxicity", input="hello", output=None, - project_id=None, config=None, timeout=10.0, ) From 025f96f52e5d5d55330acd57a6ee300d8eb90616 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Fri, 15 May 2026 11:28:25 -0700 Subject: [PATCH 12/20] add evaluator context --- engine/src/agent_control_engine/core.py | 9 +- engine/tests/test_core.py | 192 ++++++++++++++++++ .../src/agent_control_evaluators/__init__.py | 8 +- .../src/agent_control_evaluators/_base.py | 62 +++++- evaluators/builtin/tests/test_base.py | 120 ++++++++++- .../luna/client.py | 31 ++- .../luna/config.py | 28 ++- .../luna/evaluator.py | 75 ++++++- .../galileo/tests/test_luna_evaluator.py | 66 ++++++ models/src/agent_control_models/__init__.py | 2 + models/src/agent_control_models/evaluation.py | 17 ++ .../integrations/google_adk/plugin.py | 17 +- 12 files changed, 599 insertions(+), 28 deletions(-) diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index 99c2273b..0a8f1864 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -18,6 +18,7 @@ ControlAction, ControlMatch, ControlScope, + EvaluationContext, EvaluationRequest, EvaluationResponse, EvaluatorResult, @@ -188,8 +189,14 @@ async def _evaluate_leaf( if timeout <= 0: timeout = DEFAULT_EVALUATOR_TIMEOUT + context = EvaluationContext( + target_type=request.target_type, + target_id=request.target_id, + agent_name=request.agent_name, + step_type=request.step.type, + ) result = await asyncio.wait_for( - evaluator.evaluate(data), + evaluator.evaluate_with_context(data, context), timeout=timeout, ) except TimeoutError: diff --git 
a/engine/tests/test_core.py b/engine/tests/test_core.py index 9c8da751..37030998 100644 --- a/engine/tests/test_core.py +++ b/engine/tests/test_core.py @@ -2353,3 +2353,195 @@ class MockControl: assert selector_errors[0].result.error is not None assert "Invalid step_name_regex" in selector_errors[0].result.error assert "[invalid(regex" in selector_errors[0].result.message + + +# ============================================================================= +# Test: EvaluationContext Propagation +# ============================================================================= + + +_observed_contexts: list[Any] = [] + + +def _reset_observed_contexts() -> None: + _observed_contexts.clear() + + +class ContextRecordingEvaluator(Evaluator[SimpleConfig]): + """Evaluator that overrides evaluate_with_context to record the context. + + Verifies that the engine populates EvaluationContext correctly. Records + every (data, context) pair it observes so concurrent invocations on a + shared instance can be inspected. + """ + + metadata = EvaluatorMetadata( + name="test-context-recorder", + version="1.0.0", + description="Records EvaluationContext", + ) + config_model = SimpleConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + # Should not be hit when the engine routes through evaluate_with_context. + _observed_contexts.append(("evaluate-fallback", data, None)) + return EvaluatorResult(matched=False, confidence=1.0, message="fallback") + + async def evaluate_with_context( + self, + data: Any, + context: Any = None, + ) -> EvaluatorResult: + _observed_contexts.append(("with-context", data, context)) + return EvaluatorResult(matched=False, confidence=1.0, message="ok") + + +class LegacyOnlyEvaluator(Evaluator[SimpleConfig]): + """Evaluator that overrides only evaluate(data), proving the default + evaluate_with_context fallback routes back to it. 
+ """ + + metadata = EvaluatorMetadata( + name="test-legacy-only", + version="1.0.0", + description="Legacy signature only", + ) + config_model = SimpleConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + _observed_contexts.append(("legacy-evaluate", data, None)) + return EvaluatorResult(matched=False, confidence=1.0, message="legacy") + + +class TestEvaluationContextPropagation: + """Verify the engine populates and forwards EvaluationContext correctly.""" + + @pytest.fixture(autouse=True) + def _setup(self): + _reset_observed_contexts() + # Register the local fixtures (idempotent). + for cls in (ContextRecordingEvaluator, LegacyOnlyEvaluator): + try: + register_evaluator(cls) + except ValueError: + pass + yield + _reset_observed_contexts() + + @pytest.mark.asyncio + async def test_engine_populates_context_from_request(self): + """Engine builds an EvaluationContext from the request and passes it + to ``evaluate_with_context``. + """ + from agent_control_models import EvaluationContext + + controls = [ + make_control(1, "ctx1", "test-context-recorder", action="observe"), + ] + engine = ControlEngine(controls) + + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step(type="llm", name="step-x", input="hello", output=None), + stage="pre", + target_type="log_stream", + target_id="ls-42", + ) + await engine.process(request) + + with_context_observations = [ + entry for entry in _observed_contexts if entry[0] == "with-context" + ] + assert len(with_context_observations) == 1, _observed_contexts + _, _, context = with_context_observations[0] + assert isinstance(context, EvaluationContext) + assert context.target_type == "log_stream" + assert context.target_id == "ls-42" + assert context.agent_name == "00000000-0000-0000-0000-000000000001" + assert context.step_type == "llm" + + @pytest.mark.asyncio + async def test_engine_passes_context_with_none_target_when_unbound(self): + """When the request carries no target 
binding, target_* on the + context are None but the context object is still supplied. + """ + from agent_control_models import EvaluationContext + + controls = [ + make_control(1, "ctx1", "test-context-recorder", action="observe"), + ] + engine = ControlEngine(controls) + + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step(type="llm", name="step-y", input="x", output=None), + stage="pre", + # No target_type / target_id. + ) + await engine.process(request) + + with_context = [e for e in _observed_contexts if e[0] == "with-context"] + assert len(with_context) == 1 + context = with_context[0][2] + assert isinstance(context, EvaluationContext) + assert context.target_type is None + assert context.target_id is None + assert context.step_type == "llm" + + @pytest.mark.asyncio + async def test_legacy_evaluator_still_works_via_default_fallback(self): + """Subclasses overriding only ``evaluate(data)`` keep working: the + base ``evaluate_with_context`` default routes back to them. + """ + controls = [ + make_control(1, "ctx1", "test-legacy-only", action="observe"), + ] + engine = ControlEngine(controls) + + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step(type="llm", name="step-z", input="hello", output=None), + stage="pre", + target_type="log_stream", + target_id="ls-99", + ) + await engine.process(request) + + legacy_calls = [e for e in _observed_contexts if e[0] == "legacy-evaluate"] + assert len(legacy_calls) == 1, _observed_contexts + # The legacy entry point receives data only; no context object is + # observed because the default forwarder drops it to call ``evaluate``. 
+ _, data, _ = legacy_calls[0] + assert data == "hello" + + @pytest.mark.asyncio + async def test_concurrent_requests_receive_distinct_contexts(self): + """A cached instance must observe per-call context, not a shared one.""" + from agent_control_models import EvaluationContext + + controls = [ + make_control(1, "ctx1", "test-context-recorder", action="observe"), + ] + engine = ControlEngine(controls) + + async def fire(target_id: str) -> None: + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step( + type="llm", name=f"step-{target_id}", input="hi", output=None + ), + stage="pre", + target_type="log_stream", + target_id=target_id, + ) + await engine.process(request) + + await asyncio.gather(*(fire(f"ls-{i}") for i in range(5))) + + with_context = [e for e in _observed_contexts if e[0] == "with-context"] + assert len(with_context) == 5 + observed_target_ids = sorted( + context.target_id for _, _, context in with_context + if isinstance(context, EvaluationContext) + ) + assert observed_target_ids == ["ls-0", "ls-1", "ls-2", "ls-3", "ls-4"] diff --git a/evaluators/builtin/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py index b1dabd9e..163a20b0 100644 --- a/evaluators/builtin/src/agent_control_evaluators/__init__.py +++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py @@ -28,7 +28,12 @@ __version__ = "0.0.0.dev" # Core infrastructure - export from _base and _registry -from agent_control_evaluators._base import Evaluator, EvaluatorConfig, EvaluatorMetadata +from agent_control_evaluators._base import ( + EvaluationContext, + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, +) from agent_control_evaluators._discovery import ( discover_evaluators, ensure_evaluators_discovered, @@ -51,6 +56,7 @@ __all__ = [ # Core infrastructure + "EvaluationContext", "Evaluator", "EvaluatorConfig", "EvaluatorMetadata", diff --git 
a/evaluators/builtin/src/agent_control_evaluators/_base.py b/evaluators/builtin/src/agent_control_evaluators/_base.py index f5e6fc77..069b2e52 100644 --- a/evaluators/builtin/src/agent_control_evaluators/_base.py +++ b/evaluators/builtin/src/agent_control_evaluators/_base.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar -from agent_control_models import EvaluatorResult +from agent_control_models import EvaluationContext, EvaluatorResult from agent_control_models.base import BaseModel if TYPE_CHECKING: @@ -91,7 +91,25 @@ def __init__(self, config): async def evaluate(self, data): self.call_count += 1 # BAD: race condition, leaks between requests - Example: + Runtime Context: + Most evaluators only need the selected ``data`` to decide. Some need + request-scoped context (the bound target, agent name, step type, etc.) + to call out to an external service or change behavior per call. The + contract is: + + - ``evaluate(data)`` is the abstract entry point. Every subclass must + implement it. It is called by direct callers (tests, examples) and + serves as the canonical "no-context" path. + - ``evaluate_with_context(data, context)`` is what the engine calls. + Its default implementation delegates to ``evaluate(data)``, so + existing context-free evaluators work unchanged. + + Evaluators that need context override ``evaluate_with_context`` and + either (a) reimplement ``evaluate`` as a delegate to the new method + with ``context=None`` (the Luna pattern, recommended for symmetry) or + (b) leave ``evaluate`` as a no-context fallback. 
+ + Example (context-free evaluator): ```python from agent_control_evaluators import ( Evaluator, @@ -120,6 +138,33 @@ async def evaluate(self, data: Any) -> EvaluatorResult: message="Evaluation complete" ) ``` + + Example (context-aware evaluator): + ```python + from agent_control_evaluators import ( + EvaluationContext, + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + register_evaluator, + ) + from agent_control_models import EvaluatorResult + + @register_evaluator + class TargetAwareEvaluator(Evaluator[MyConfig]): + metadata = EvaluatorMetadata(name="target-aware", version="1.0.0", description="") + config_model = MyConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + return await self.evaluate_with_context(data, context=None) + + async def evaluate_with_context( + self, data: Any, context: EvaluationContext | None = None + ) -> EvaluatorResult: + target_id = context.target_id if context else None + # ... use target_id in the decision ... + return EvaluatorResult(matched=False, confidence=1.0, message="ok") + ``` """ metadata: ClassVar[EvaluatorMetadata] @@ -160,6 +205,19 @@ async def evaluate(self, data: Any) -> EvaluatorResult: """ pass + async def evaluate_with_context( + self, + data: Any, + context: EvaluationContext | None = None, + ) -> EvaluatorResult: + """Evaluate data with optional runtime context. + + Evaluators that need request-scoped metadata may override this method. + The default keeps existing evaluators source-compatible by delegating to + the original ``evaluate(data)`` contract. 
+ """ + return await self.evaluate(data) + def get_timeout_seconds(self) -> float: """Get timeout in seconds from config or metadata default.""" timeout_ms: int = getattr(self.config, "timeout_ms", self.metadata.timeout_ms) diff --git a/evaluators/builtin/tests/test_base.py b/evaluators/builtin/tests/test_base.py index 776a8d01..368fc75d 100644 --- a/evaluators/builtin/tests/test_base.py +++ b/evaluators/builtin/tests/test_base.py @@ -1,12 +1,21 @@ """Tests for evaluator base classes. -Architecture: Evaluators take config at __init__, evaluate() only takes data. +Architecture: + - ``evaluate(data)`` is the abstract entry point every subclass implements. + - ``evaluate_with_context(data, context)`` is the context-aware entry the + engine uses; the default delegates to ``evaluate(data)`` so legacy + subclasses keep working without modification. """ import pytest from typing import Any -from agent_control_evaluators import Evaluator, EvaluatorConfig, EvaluatorMetadata +from agent_control_evaluators import ( + EvaluationContext, + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, +) from agent_control_models import EvaluatorResult @@ -138,3 +147,110 @@ def test_cannot_instantiate_abstract_class(self): """Test that Evaluator cannot be instantiated directly.""" with pytest.raises(TypeError, match="abstract"): Evaluator({}) # type: ignore + + +class TestEvaluateWithContext: + """Tests for the context-aware entry point on the base Evaluator.""" + + @pytest.mark.asyncio + async def test_default_evaluate_with_context_delegates_to_evaluate(self): + """A subclass that only implements ``evaluate`` is still reachable + through ``evaluate_with_context``. + """ + evaluator = MockEvaluator.from_dict({"should_match": True}) + + result = await evaluator.evaluate_with_context("payload") + + # The legacy ``evaluate`` returns matched=True and stores the data + # in metadata. If the default fallback worked, those carry through. 
+ assert result.matched is True + assert result.metadata["data"] == "payload" + + @pytest.mark.asyncio + async def test_default_evaluate_with_context_ignores_context(self): + """The default forwarder drops the context when it calls ``evaluate`` + — this is by design so legacy implementations are unaffected. + """ + evaluator = MockEvaluator.from_dict({"should_match": False}) + + context = EvaluationContext( + target_type="log_stream", + target_id="ls-123", + agent_name="acme", + step_type="llm", + ) + + # Should not raise, even though MockEvaluator.evaluate has no kwargs + # for context. The default forwarder strips it. + result = await evaluator.evaluate_with_context("data", context) + + assert result.matched is False + assert result.metadata["data"] == "data" + + @pytest.mark.asyncio + async def test_subclass_can_override_evaluate_with_context(self): + """A subclass override of ``evaluate_with_context`` is preferred over + the default fallback when the engine calls it. + """ + + class ContextAwareConfig(EvaluatorConfig): + pass + + class ContextAware(Evaluator[ContextAwareConfig]): + metadata = EvaluatorMetadata( + name="ctx-aware", + version="1.0.0", + description="", + ) + config_model = ContextAwareConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + # Canonical "no-context" delegate pattern. + return await self.evaluate_with_context(data, context=None) + + async def evaluate_with_context( + self, + data: Any, + context: EvaluationContext | None = None, + ) -> EvaluatorResult: + target_id = context.target_id if context else "no-target" + return EvaluatorResult( + matched=True, + confidence=1.0, + message=f"saw {target_id}", + ) + + evaluator = ContextAware.from_dict({}) + + ctx = EvaluationContext(target_type="log_stream", target_id="ls-7") + result = await evaluator.evaluate_with_context("data", ctx) + assert result.message == "saw ls-7" + + # The Luna-pattern ``evaluate`` should also work as the no-context path. 
+ result_no_ctx = await evaluator.evaluate("data") + assert result_no_ctx.message == "saw no-target" + + @pytest.mark.asyncio + async def test_evaluation_context_defaults_are_none(self): + """All EvaluationContext fields default to None and the dataclass is + constructible with no arguments. Regression guard against orphan + fields that have no populator on the engine side. + """ + ctx = EvaluationContext() + assert ctx.target_type is None + assert ctx.target_id is None + assert ctx.agent_name is None + assert ctx.step_type is None + # Confirm we did not silently keep namespace_key around; reading an + # unknown attribute should fail. + with pytest.raises(AttributeError): + _ = ctx.namespace_key # type: ignore[attr-defined] + + def test_evaluation_context_is_importable_from_evaluators_package(self): + """EvaluationContext is re-exported from agent_control_evaluators so + subclasses can colocate their imports. + """ + from agent_control_evaluators import EvaluationContext as Reexported + from agent_control_models import EvaluationContext as Canonical + + assert Reexported is Canonical diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index caca997e..6c2b7d61 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -86,16 +86,26 @@ class ScorerInvokeRequest(BaseModel): Attributes: inputs: Selected scorer input values. + logstream_id: Optional Galileo log stream identifier for runtime context. scorer_label: Preset, registered, or fine-tuned scorer label. + scorer_id: Optional Galileo scorer identifier. + scorer_version_id: Optional Galileo scorer version identifier. config: Optional scorer-specific configuration. 
""" - scorer_label: str = Field(min_length=1) inputs: ScorerInvokeInputs + logstream_id: str | None = Field(default=None, min_length=1) + scorer_label: str | None = Field(default=None, min_length=1) + scorer_id: str | None = Field(default=None, min_length=1) + scorer_version_id: str | None = Field(default=None, min_length=1) config: JSONObject | None = None @model_validator(mode="after") - def ensure_input_or_output(self) -> ScorerInvokeRequest: + def ensure_required_values(self) -> ScorerInvokeRequest: + if not (self.scorer_label or self.scorer_id or self.scorer_version_id): + raise ValueError( + "One of scorer_label, scorer_id, or scorer_version_id must be set." + ) if not (_has_value(self.inputs.query) or _has_value(self.inputs.response)): raise ValueError("Either inputs.query or inputs.response must be set.") return self @@ -109,14 +119,14 @@ class ScorerInvokeResponse(BaseModel): """Response from Galileo Luna scorer invocation. Attributes: - scorer_label: Echoed scorer label. + scorer_label: Echoed scorer label, when returned. score: Raw scorer value. status: Invocation status. execution_time: Execution time in seconds, when returned. error_message: Error detail for non-success statuses. """ - scorer_label: str + scorer_label: str | None = None score: JSONValue status: str = "unknown" execution_time: float | None = None @@ -229,7 +239,10 @@ def _endpoint_and_headers( async def invoke( self, *, - scorer_label: str, + scorer_label: str | None = None, + scorer_id: str | None = None, + scorer_version_id: str | None = None, + logstream_id: str | None = None, input: JSONValue = None, output: JSONValue = None, config: JSONObject | None = None, @@ -240,6 +253,9 @@ async def invoke( Args: scorer_label: Preset, registered, or fine-tuned scorer label. + scorer_id: Optional Galileo scorer identifier. + scorer_version_id: Optional Galileo scorer version identifier. + logstream_id: Optional Galileo log stream identifier for runtime context. 
input: Optional user/system prompt text. output: Optional model response text. config: Optional scorer-specific configuration. @@ -255,11 +271,16 @@ async def invoke( httpx.HTTPStatusError: If the API returns an error status code. httpx.RequestError: If the request fails before a response is received. """ + if not (scorer_label or scorer_id or scorer_version_id): + raise ValueError("At least one scorer identifier must be provided.") if not (_has_value(input) or _has_value(output)): raise ValueError("At least one of input or output must be provided.") request_body = ScorerInvokeRequest( scorer_label=scorer_label, + scorer_id=scorer_id, + scorer_version_id=scorer_version_id, + logstream_id=logstream_id, inputs=ScorerInvokeInputs( query="" if input is None else input, response="" if output is None else output ), diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index 0f0d86d5..1136ffbe 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -31,14 +31,36 @@ class LunaEvaluatorConfig(EvaluatorConfig): """Configuration for direct Luna scorer evaluation. Attributes: + logstream_id: Optional Galileo log stream identifier used as runtime context. scorer_label: Preset, registered, or fine-tuned scorer label. + scorer_id: Optional Galileo scorer identifier. + scorer_version_id: Optional Galileo scorer version identifier. threshold: Local threshold used by the evaluator for comparison. operator: Local comparison operator. Numeric operators use threshold as a number. scorer_config: Optional scorer-specific config sent as ``config``. timeout_ms: Request timeout in milliseconds. 
""" - scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke") + logstream_id: str | None = Field( + default=None, + min_length=1, + description="Optional Galileo log stream identifier used as scorer runtime context.", + ) + scorer_label: str | None = Field( + default=None, + min_length=1, + description="Luna scorer label to invoke.", + ) + scorer_id: str | None = Field( + default=None, + min_length=1, + description="Optional Galileo scorer identifier to invoke.", + ) + scorer_version_id: str | None = Field( + default=None, + min_length=1, + description="Optional Galileo scorer version identifier to invoke.", + ) threshold: JSONValue = Field( default=0.5, description="Local threshold used to decide whether the control matches.", @@ -63,6 +85,10 @@ class LunaEvaluatorConfig(EvaluatorConfig): @model_validator(mode="after") def validate_threshold(self) -> LunaEvaluatorConfig: """Validate threshold compatibility with the configured operator.""" + if not (self.scorer_label or self.scorer_id or self.scorer_version_id): + raise ValueError( + "one of scorer_label, scorer_id, or scorer_version_id is required" + ) if self.operator in _NUMERIC_OPERATORS and coerce_number(self.threshold) is None: raise ValueError(f"operator '{self.operator}' requires a numeric threshold") if self.operator != "any" and self.threshold is None: diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index 15798074..79749a84 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -9,7 +9,7 @@ from typing import Any from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator -from agent_control_models import EvaluatorResult, JSONValue +from agent_control_models import EvaluationContext, 
EvaluatorResult, JSONValue from .client import GalileoLunaClient, ScorerInvokeResponse from .config import LunaEvaluatorConfig, coerce_number @@ -76,6 +76,10 @@ def _confidence_from_score(score: JSONValue) -> float: return 1.0 +def _is_logstream_target(target_type: str | None) -> bool: + return (target_type or "").lower() in {"logstream", "log_stream", "log-stream"} + + @register_evaluator class LunaEvaluator(Evaluator[LunaEvaluatorConfig]): """Galileo Luna evaluator using the direct scorer invocation API.""" @@ -133,7 +137,8 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]: return input_text, output_text text = _coerce_payload_text(data) - if "output" in self.config.scorer_label: + scorer_label = self.config.scorer_label or "" + if "output" in scorer_label: return None, text return text, None @@ -170,26 +175,37 @@ def _score_matches(self, score: JSONValue) -> bool: raise ValueError(f"Unsupported Luna operator: {operator}") async def evaluate(self, data: Any) -> EvaluatorResult: + """Evaluate selected data without runtime context.""" + return await self.evaluate_with_context(data, context=None) + + async def evaluate_with_context( + self, + data: Any, + context: EvaluationContext | None = None, + ) -> EvaluatorResult: """Evaluate selected data with Galileo Luna direct scorer invocation. Args: data: The data selected from the runtime step. + context: Optional runtime context from the engine. Returns: EvaluatorResult with local threshold decision and scorer metadata. 
""" input_text, output_text = self._prepare_payload(data) + logstream_id = self._resolve_logstream_id(context) if not (_has_text(input_text) or _has_text(output_text)): return EvaluatorResult( matched=False, confidence=1.0, message="No data to score with Luna", - metadata={"scorer_label": self.config.scorer_label}, + metadata=self._base_metadata(logstream_id=logstream_id), ) try: + scorer_kwargs = self._scorer_kwargs(logstream_id=logstream_id) response = await self._get_client().invoke( - scorer_label=self.config.scorer_label, + **scorer_kwargs, input=input_text if _has_text(input_text) else None, output=output_text if _has_text(output_text) else None, config=self.config.scorer_config, @@ -201,7 +217,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: raise RuntimeError(message) matched = self._score_matches(response.score) - metadata = self._metadata(response) + metadata = self._metadata(response, logstream_id=logstream_id) operator = self.config.operator threshold = self.config.threshold state = "triggered" if matched else "not triggered" @@ -216,10 +232,39 @@ async def evaluate(self, data: Any) -> EvaluatorResult: ) except Exception as exc: logger.error("Luna evaluation error: %s", exc, exc_info=True) - return self._handle_error(exc) - - def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: - metadata: dict[str, Any] = { + return self._handle_error(exc, logstream_id=logstream_id) + + def _resolve_logstream_id(self, context: EvaluationContext | None) -> str | None: + if context is not None and _is_logstream_target(context.target_type): + return context.target_id + return self.config.logstream_id + + def _base_metadata(self, *, logstream_id: str | None = None) -> dict[str, Any]: + metadata = { + "logstream_id": logstream_id, + "scorer_label": self.config.scorer_label, + "scorer_id": self.config.scorer_id, + "scorer_version_id": self.config.scorer_version_id, + } + return {key: value for key, value in metadata.items() if value is not None} 
+ + def _scorer_kwargs(self, *, logstream_id: str | None = None) -> dict[str, Any]: + kwargs = { + "logstream_id": logstream_id, + "scorer_label": self.config.scorer_label, + "scorer_id": self.config.scorer_id, + "scorer_version_id": self.config.scorer_version_id, + } + return {key: value for key, value in kwargs.items() if value is not None} + + def _metadata( + self, + response: ScorerInvokeResponse, + *, + logstream_id: str | None = None, + ) -> dict[str, Any]: + metadata: dict[str, Any] = self._base_metadata(logstream_id=logstream_id) + metadata.update({ "scorer_label": response.scorer_label or self.config.scorer_label, "score": response.score, "threshold": self.config.threshold, @@ -227,10 +272,15 @@ def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]: "status": response.status, "execution_time_seconds": response.execution_time, "error_message": response.error_message, - } + }) return metadata - def _handle_error(self, error: Exception) -> EvaluatorResult: + def _handle_error( + self, + error: Exception, + *, + logstream_id: str | None = None, + ) -> EvaluatorResult: error_detail = str(error) return EvaluatorResult( matched=False, @@ -240,6 +290,9 @@ def _handle_error(self, error: Exception) -> EvaluatorResult: "error": error_detail, "error_type": type(error).__name__, "scorer_label": self.config.scorer_label, + "scorer_id": self.config.scorer_id, + "scorer_version_id": self.config.scorer_version_id, + "logstream_id": logstream_id, }, error=error_detail, ) diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 4e1f45b8..201d9f73 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -27,18 +27,38 @@ def test_config_accepts_direct_scorer_fields(self) -> None: # Given: a direct scorer config with local thresholding config = LunaEvaluatorConfig( + logstream_id="logstream-123", 
scorer_label="toxicity", + scorer_id="scorer-123", + scorer_version_id="version-123", threshold=0.7, operator="gte", config={"temperature": 0}, ) # Then: config is retained without Protect concepts + assert config.logstream_id == "logstream-123" assert config.scorer_label == "toxicity" + assert config.scorer_id == "scorer-123" + assert config.scorer_version_id == "version-123" assert config.threshold == 0.7 assert config.operator == "gte" assert config.scorer_config == {"temperature": 0} + def test_config_accepts_scorer_id_without_label(self) -> None: + from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig + + config = LunaEvaluatorConfig(scorer_id="scorer-123") + + assert config.scorer_id == "scorer-123" + assert config.scorer_label is None + + def test_config_requires_a_scorer_identifier(self) -> None: + from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig + + with pytest.raises(ValidationError, match="one of scorer_label"): + LunaEvaluatorConfig(threshold=0.5) + def test_numeric_operator_requires_numeric_threshold(self) -> None: from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig @@ -55,14 +75,20 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: # Given: a scorer request with scorer config request = ScorerInvokeRequest( + logstream_id="logstream-123", scorer_label="toxicity", + scorer_id="scorer-123", + scorer_version_id="version-123", inputs=ScorerInvokeInputs(query={"messages": [{"role": "user", "content": "hello"}]}), config={"top_k": 1}, ) # Then: the serialized payload uses the API-owned scorer invoke fields assert request.to_dict() == { + "logstream_id": "logstream-123", "scorer_label": "toxicity", + "scorer_id": "scorer-123", + "scorer_version_id": "version-123", "inputs": { "query": {"messages": [{"role": "user", "content": "hello"}]}, "response": "", @@ -383,6 +409,46 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: timeout=5.0, ) + 
@patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) + @pytest.mark.asyncio + async def test_evaluator_passes_logstream_id_from_runtime_context(self) -> None: + from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + from agent_control_models import EvaluationContext + + evaluator = LunaEvaluator.from_dict( + { + "logstream_id": "config-logstream", + "scorer_label": "toxicity", + "scorer_id": "scorer-123", + "scorer_version_id": "version-123", + } + ) + + with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: + mock_invoke.return_value = ScorerInvokeResponse( + scorer_label="toxicity", + score=0.82, + status="success", + ) + + result = await evaluator.evaluate_with_context( + "hello", + EvaluationContext(target_type="log_stream", target_id="runtime-logstream"), + ) + + assert result.metadata["logstream_id"] == "runtime-logstream" + mock_invoke.assert_awaited_once_with( + logstream_id="runtime-logstream", + scorer_label="toxicity", + scorer_id="scorer-123", + scorer_version_id="version-123", + input="hello", + output=None, + config=None, + timeout=10.0, + ) + @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) @pytest.mark.asyncio async def test_evaluator_returns_non_match_below_threshold(self) -> None: diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py index 148cdd7a..867367bd 100644 --- a/models/src/agent_control_models/__init__.py +++ b/models/src/agent_control_models/__init__.py @@ -61,6 +61,7 @@ make_error_type, ) from .evaluation import ( + EvaluationContext, EvaluationRequest, EvaluationResponse, EvaluationResult, @@ -132,6 +133,7 @@ # Policy "Policy", # Evaluation + "EvaluationContext", "EvaluationRequest", "EvaluationResponse", "EvaluationResult", diff --git a/models/src/agent_control_models/evaluation.py b/models/src/agent_control_models/evaluation.py index 
50b5791b..5c81b0c1 100644 --- a/models/src/agent_control_models/evaluation.py +++ b/models/src/agent_control_models/evaluation.py @@ -1,4 +1,5 @@ """Evaluation-related models.""" +from dataclasses import dataclass from typing import Annotated, Literal, Self from pydantic import Field, StringConstraints, field_validator, model_validator @@ -13,6 +14,22 @@ ] +@dataclass +class EvaluationContext: + """Runtime context available while evaluating a control. + + This is intentionally small and mutable by normal dataclass semantics so + downstream users can subclass it with richer runtime context when needed. + Only fields the engine actually populates today are declared here; add new + fields only when there is a populator on every supported call path. + """ + + target_type: str | None = None + target_id: str | None = None + agent_name: str | None = None + step_type: str | None = None + + class EvaluationRequest(BaseModel): """ Request model for evaluation analysis. diff --git a/sdks/python/src/agent_control/integrations/google_adk/plugin.py b/sdks/python/src/agent_control/integrations/google_adk/plugin.py index eb2155c8..28e59698 100644 --- a/sdks/python/src/agent_control/integrations/google_adk/plugin.py +++ b/sdks/python/src/agent_control/integrations/google_adk/plugin.py @@ -22,11 +22,18 @@ from agent_control.validation import ensure_agent_name try: - from google.adk.agents.callback_context import CallbackContext # type: ignore[import-not-found] - from google.adk.models import LlmRequest, LlmResponse # type: ignore[import-not-found] - from google.adk.plugins import BasePlugin # type: ignore[import-not-found] - from google.adk.tools import BaseTool # type: ignore[import-not-found] - from google.adk.tools.tool_context import ToolContext # type: ignore[import-not-found] + from google.adk.agents.callback_context import ( # type: ignore[import-not-found,import-untyped] + CallbackContext, + ) + from google.adk.models import ( # type: ignore[import-not-found,import-untyped] 
+ LlmRequest, + LlmResponse, + ) + from google.adk.plugins import BasePlugin # type: ignore[import-not-found,import-untyped] + from google.adk.tools import BaseTool # type: ignore[import-not-found,import-untyped] + from google.adk.tools.tool_context import ( # type: ignore[import-not-found,import-untyped] + ToolContext, + ) except Exception as exc: # pragma: no cover - optional dependency raise RuntimeError( "Google ADK integration requires google-adk. " From 15e7a01c1f9a76430cce2870a8fb72bf80796303 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Fri, 15 May 2026 12:57:42 -0700 Subject: [PATCH 13/20] remove evaluation context --- engine/src/agent_control_engine/core.py | 9 +- engine/tests/test_core.py | 242 ++++-------------- .../src/agent_control_evaluators/__init__.py | 2 - .../src/agent_control_evaluators/_base.py | 61 +---- evaluators/builtin/tests/test_base.py | 120 +-------- .../luna/client.py | 5 - .../luna/config.py | 6 - .../luna/evaluator.py | 42 +-- .../galileo/tests/test_luna_evaluator.py | 44 ---- models/src/agent_control_models/__init__.py | 2 - models/src/agent_control_models/evaluation.py | 17 -- 11 files changed, 56 insertions(+), 494 deletions(-) diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index 0a8f1864..99c2273b 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -18,7 +18,6 @@ ControlAction, ControlMatch, ControlScope, - EvaluationContext, EvaluationRequest, EvaluationResponse, EvaluatorResult, @@ -189,14 +188,8 @@ async def _evaluate_leaf( if timeout <= 0: timeout = DEFAULT_EVALUATOR_TIMEOUT - context = EvaluationContext( - target_type=request.target_type, - target_id=request.target_id, - agent_name=request.agent_name, - step_type=request.step.type, - ) result = await asyncio.wait_for( - evaluator.evaluate_with_context(data, context), + evaluator.evaluate(data), timeout=timeout, ) except TimeoutError: diff --git a/engine/tests/test_core.py 
b/engine/tests/test_core.py index 37030998..78eda0ab 100644 --- a/engine/tests/test_core.py +++ b/engine/tests/test_core.py @@ -831,7 +831,13 @@ async def test_confidence_is_full_on_deny_match(self): controls = [ make_control(1, "denier", "test-deny", action="deny", config_value="d"), ] + [ - make_control(i + 2, f"blocker{i}", "test-blocker", action="observe", config_value=str(i)) + make_control( + i + 2, + f"blocker{i}", + "test-blocker", + action="observe", + config_value=str(i), + ) for i in range(9) ] engine = ControlEngine(controls) @@ -1830,10 +1836,20 @@ async def test_server_context_only_runs_server_controls(self): """ controls = [ make_control_with_execution( - 1, "local_ctrl", "test-allow", action="observe", config_value="loc", execution="sdk" + 1, + "local_ctrl", + "test-allow", + action="observe", + config_value="loc", + execution="sdk", ), make_control_with_execution( - 2, "server_ctrl", "test-allow", action="observe", config_value="srv", execution="server" + 2, + "server_ctrl", + "test-allow", + action="observe", + config_value="srv", + execution="server", ), ] engine = ControlEngine(controls, context="server") @@ -1860,10 +1876,20 @@ async def test_sdk_context_only_runs_sdk_controls(self): """ controls = [ make_control_with_execution( - 1, "local_ctrl", "test-allow", action="observe", config_value="loc", execution="sdk" + 1, + "local_ctrl", + "test-allow", + action="observe", + config_value="loc", + execution="sdk", ), make_control_with_execution( - 2, "server_ctrl", "test-allow", action="observe", config_value="srv", execution="server" + 2, + "server_ctrl", + "test-allow", + action="observe", + config_value="srv", + execution="server", ), ] engine = ControlEngine(controls, context="sdk") @@ -1890,10 +1916,20 @@ async def test_default_context_is_server(self): """ controls = [ make_control_with_execution( - 1, "local_ctrl", "test-allow", action="observe", config_value="loc", execution="sdk" + 1, + "local_ctrl", + "test-allow", + action="observe", + 
config_value="loc", + execution="sdk", ), make_control_with_execution( - 2, "server_ctrl", "test-allow", action="observe", config_value="srv", execution="server" + 2, + "server_ctrl", + "test-allow", + action="observe", + config_value="srv", + execution="server", ), ] engine = ControlEngine(controls) # No context param @@ -2353,195 +2389,3 @@ class MockControl: assert selector_errors[0].result.error is not None assert "Invalid step_name_regex" in selector_errors[0].result.error assert "[invalid(regex" in selector_errors[0].result.message - - -# ============================================================================= -# Test: EvaluationContext Propagation -# ============================================================================= - - -_observed_contexts: list[Any] = [] - - -def _reset_observed_contexts() -> None: - _observed_contexts.clear() - - -class ContextRecordingEvaluator(Evaluator[SimpleConfig]): - """Evaluator that overrides evaluate_with_context to record the context. - - Verifies that the engine populates EvaluationContext correctly. Records - every (data, context) pair it observes so concurrent invocations on a - shared instance can be inspected. - """ - - metadata = EvaluatorMetadata( - name="test-context-recorder", - version="1.0.0", - description="Records EvaluationContext", - ) - config_model = SimpleConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - # Should not be hit when the engine routes through evaluate_with_context. 
- _observed_contexts.append(("evaluate-fallback", data, None)) - return EvaluatorResult(matched=False, confidence=1.0, message="fallback") - - async def evaluate_with_context( - self, - data: Any, - context: Any = None, - ) -> EvaluatorResult: - _observed_contexts.append(("with-context", data, context)) - return EvaluatorResult(matched=False, confidence=1.0, message="ok") - - -class LegacyOnlyEvaluator(Evaluator[SimpleConfig]): - """Evaluator that overrides only evaluate(data), proving the default - evaluate_with_context fallback routes back to it. - """ - - metadata = EvaluatorMetadata( - name="test-legacy-only", - version="1.0.0", - description="Legacy signature only", - ) - config_model = SimpleConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - _observed_contexts.append(("legacy-evaluate", data, None)) - return EvaluatorResult(matched=False, confidence=1.0, message="legacy") - - -class TestEvaluationContextPropagation: - """Verify the engine populates and forwards EvaluationContext correctly.""" - - @pytest.fixture(autouse=True) - def _setup(self): - _reset_observed_contexts() - # Register the local fixtures (idempotent). - for cls in (ContextRecordingEvaluator, LegacyOnlyEvaluator): - try: - register_evaluator(cls) - except ValueError: - pass - yield - _reset_observed_contexts() - - @pytest.mark.asyncio - async def test_engine_populates_context_from_request(self): - """Engine builds an EvaluationContext from the request and passes it - to ``evaluate_with_context``. 
- """ - from agent_control_models import EvaluationContext - - controls = [ - make_control(1, "ctx1", "test-context-recorder", action="observe"), - ] - engine = ControlEngine(controls) - - request = EvaluationRequest( - agent_name="00000000-0000-0000-0000-000000000001", - step=Step(type="llm", name="step-x", input="hello", output=None), - stage="pre", - target_type="log_stream", - target_id="ls-42", - ) - await engine.process(request) - - with_context_observations = [ - entry for entry in _observed_contexts if entry[0] == "with-context" - ] - assert len(with_context_observations) == 1, _observed_contexts - _, _, context = with_context_observations[0] - assert isinstance(context, EvaluationContext) - assert context.target_type == "log_stream" - assert context.target_id == "ls-42" - assert context.agent_name == "00000000-0000-0000-0000-000000000001" - assert context.step_type == "llm" - - @pytest.mark.asyncio - async def test_engine_passes_context_with_none_target_when_unbound(self): - """When the request carries no target binding, target_* on the - context are None but the context object is still supplied. - """ - from agent_control_models import EvaluationContext - - controls = [ - make_control(1, "ctx1", "test-context-recorder", action="observe"), - ] - engine = ControlEngine(controls) - - request = EvaluationRequest( - agent_name="00000000-0000-0000-0000-000000000001", - step=Step(type="llm", name="step-y", input="x", output=None), - stage="pre", - # No target_type / target_id. 
- ) - await engine.process(request) - - with_context = [e for e in _observed_contexts if e[0] == "with-context"] - assert len(with_context) == 1 - context = with_context[0][2] - assert isinstance(context, EvaluationContext) - assert context.target_type is None - assert context.target_id is None - assert context.step_type == "llm" - - @pytest.mark.asyncio - async def test_legacy_evaluator_still_works_via_default_fallback(self): - """Subclasses overriding only ``evaluate(data)`` keep working: the - base ``evaluate_with_context`` default routes back to them. - """ - controls = [ - make_control(1, "ctx1", "test-legacy-only", action="observe"), - ] - engine = ControlEngine(controls) - - request = EvaluationRequest( - agent_name="00000000-0000-0000-0000-000000000001", - step=Step(type="llm", name="step-z", input="hello", output=None), - stage="pre", - target_type="log_stream", - target_id="ls-99", - ) - await engine.process(request) - - legacy_calls = [e for e in _observed_contexts if e[0] == "legacy-evaluate"] - assert len(legacy_calls) == 1, _observed_contexts - # The legacy entry point receives data only; no context object is - # observed because the default forwarder drops it to call ``evaluate``. 
- _, data, _ = legacy_calls[0] - assert data == "hello" - - @pytest.mark.asyncio - async def test_concurrent_requests_receive_distinct_contexts(self): - """A cached instance must observe per-call context, not a shared one.""" - from agent_control_models import EvaluationContext - - controls = [ - make_control(1, "ctx1", "test-context-recorder", action="observe"), - ] - engine = ControlEngine(controls) - - async def fire(target_id: str) -> None: - request = EvaluationRequest( - agent_name="00000000-0000-0000-0000-000000000001", - step=Step( - type="llm", name=f"step-{target_id}", input="hi", output=None - ), - stage="pre", - target_type="log_stream", - target_id=target_id, - ) - await engine.process(request) - - await asyncio.gather(*(fire(f"ls-{i}") for i in range(5))) - - with_context = [e for e in _observed_contexts if e[0] == "with-context"] - assert len(with_context) == 5 - observed_target_ids = sorted( - context.target_id for _, _, context in with_context - if isinstance(context, EvaluationContext) - ) - assert observed_target_ids == ["ls-0", "ls-1", "ls-2", "ls-3", "ls-4"] diff --git a/evaluators/builtin/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py index 163a20b0..d435d801 100644 --- a/evaluators/builtin/src/agent_control_evaluators/__init__.py +++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py @@ -29,7 +29,6 @@ # Core infrastructure - export from _base and _registry from agent_control_evaluators._base import ( - EvaluationContext, Evaluator, EvaluatorConfig, EvaluatorMetadata, @@ -56,7 +55,6 @@ __all__ = [ # Core infrastructure - "EvaluationContext", "Evaluator", "EvaluatorConfig", "EvaluatorMetadata", diff --git a/evaluators/builtin/src/agent_control_evaluators/_base.py b/evaluators/builtin/src/agent_control_evaluators/_base.py index 069b2e52..bf36f8c1 100644 --- a/evaluators/builtin/src/agent_control_evaluators/_base.py +++ b/evaluators/builtin/src/agent_control_evaluators/_base.py @@ 
-7,7 +7,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar -from agent_control_models import EvaluationContext, EvaluatorResult +from agent_control_models import EvaluatorResult from agent_control_models.base import BaseModel if TYPE_CHECKING: @@ -91,25 +91,7 @@ def __init__(self, config): async def evaluate(self, data): self.call_count += 1 # BAD: race condition, leaks between requests - Runtime Context: - Most evaluators only need the selected ``data`` to decide. Some need - request-scoped context (the bound target, agent name, step type, etc.) - to call out to an external service or change behavior per call. The - contract is: - - - ``evaluate(data)`` is the abstract entry point. Every subclass must - implement it. It is called by direct callers (tests, examples) and - serves as the canonical "no-context" path. - - ``evaluate_with_context(data, context)`` is what the engine calls. - Its default implementation delegates to ``evaluate(data)``, so - existing context-free evaluators work unchanged. - - Evaluators that need context override ``evaluate_with_context`` and - either (a) reimplement ``evaluate`` as a delegate to the new method - with ``context=None`` (the Luna pattern, recommended for symmetry) or - (b) leave ``evaluate`` as a no-context fallback. 
- - Example (context-free evaluator): + Example: ```python from agent_control_evaluators import ( Evaluator, @@ -139,32 +121,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: ) ``` - Example (context-aware evaluator): - ```python - from agent_control_evaluators import ( - EvaluationContext, - Evaluator, - EvaluatorConfig, - EvaluatorMetadata, - register_evaluator, - ) - from agent_control_models import EvaluatorResult - - @register_evaluator - class TargetAwareEvaluator(Evaluator[MyConfig]): - metadata = EvaluatorMetadata(name="target-aware", version="1.0.0", description="") - config_model = MyConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - return await self.evaluate_with_context(data, context=None) - - async def evaluate_with_context( - self, data: Any, context: EvaluationContext | None = None - ) -> EvaluatorResult: - target_id = context.target_id if context else None - # ... use target_id in the decision ... - return EvaluatorResult(matched=False, confidence=1.0, message="ok") - ``` """ metadata: ClassVar[EvaluatorMetadata] @@ -205,19 +161,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: """ pass - async def evaluate_with_context( - self, - data: Any, - context: EvaluationContext | None = None, - ) -> EvaluatorResult: - """Evaluate data with optional runtime context. - - Evaluators that need request-scoped metadata may override this method. - The default keeps existing evaluators source-compatible by delegating to - the original ``evaluate(data)`` contract. 
- """ - return await self.evaluate(data) - def get_timeout_seconds(self) -> float: """Get timeout in seconds from config or metadata default.""" timeout_ms: int = getattr(self.config, "timeout_ms", self.metadata.timeout_ms) diff --git a/evaluators/builtin/tests/test_base.py b/evaluators/builtin/tests/test_base.py index 368fc75d..776a8d01 100644 --- a/evaluators/builtin/tests/test_base.py +++ b/evaluators/builtin/tests/test_base.py @@ -1,21 +1,12 @@ """Tests for evaluator base classes. -Architecture: - - ``evaluate(data)`` is the abstract entry point every subclass implements. - - ``evaluate_with_context(data, context)`` is the context-aware entry the - engine uses; the default delegates to ``evaluate(data)`` so legacy - subclasses keep working without modification. +Architecture: Evaluators take config at __init__, evaluate() only takes data. """ import pytest from typing import Any -from agent_control_evaluators import ( - EvaluationContext, - Evaluator, - EvaluatorConfig, - EvaluatorMetadata, -) +from agent_control_evaluators import Evaluator, EvaluatorConfig, EvaluatorMetadata from agent_control_models import EvaluatorResult @@ -147,110 +138,3 @@ def test_cannot_instantiate_abstract_class(self): """Test that Evaluator cannot be instantiated directly.""" with pytest.raises(TypeError, match="abstract"): Evaluator({}) # type: ignore - - -class TestEvaluateWithContext: - """Tests for the context-aware entry point on the base Evaluator.""" - - @pytest.mark.asyncio - async def test_default_evaluate_with_context_delegates_to_evaluate(self): - """A subclass that only implements ``evaluate`` is still reachable - through ``evaluate_with_context``. - """ - evaluator = MockEvaluator.from_dict({"should_match": True}) - - result = await evaluator.evaluate_with_context("payload") - - # The legacy ``evaluate`` returns matched=True and stores the data - # in metadata. If the default fallback worked, those carry through. 
- assert result.matched is True - assert result.metadata["data"] == "payload" - - @pytest.mark.asyncio - async def test_default_evaluate_with_context_ignores_context(self): - """The default forwarder drops the context when it calls ``evaluate`` - — this is by design so legacy implementations are unaffected. - """ - evaluator = MockEvaluator.from_dict({"should_match": False}) - - context = EvaluationContext( - target_type="log_stream", - target_id="ls-123", - agent_name="acme", - step_type="llm", - ) - - # Should not raise, even though MockEvaluator.evaluate has no kwargs - # for context. The default forwarder strips it. - result = await evaluator.evaluate_with_context("data", context) - - assert result.matched is False - assert result.metadata["data"] == "data" - - @pytest.mark.asyncio - async def test_subclass_can_override_evaluate_with_context(self): - """A subclass override of ``evaluate_with_context`` is preferred over - the default fallback when the engine calls it. - """ - - class ContextAwareConfig(EvaluatorConfig): - pass - - class ContextAware(Evaluator[ContextAwareConfig]): - metadata = EvaluatorMetadata( - name="ctx-aware", - version="1.0.0", - description="", - ) - config_model = ContextAwareConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - # Canonical "no-context" delegate pattern. - return await self.evaluate_with_context(data, context=None) - - async def evaluate_with_context( - self, - data: Any, - context: EvaluationContext | None = None, - ) -> EvaluatorResult: - target_id = context.target_id if context else "no-target" - return EvaluatorResult( - matched=True, - confidence=1.0, - message=f"saw {target_id}", - ) - - evaluator = ContextAware.from_dict({}) - - ctx = EvaluationContext(target_type="log_stream", target_id="ls-7") - result = await evaluator.evaluate_with_context("data", ctx) - assert result.message == "saw ls-7" - - # The Luna-pattern ``evaluate`` should also work as the no-context path. 
- result_no_ctx = await evaluator.evaluate("data") - assert result_no_ctx.message == "saw no-target" - - @pytest.mark.asyncio - async def test_evaluation_context_defaults_are_none(self): - """All EvaluationContext fields default to None and the dataclass is - constructible with no arguments. Regression guard against orphan - fields that have no populator on the engine side. - """ - ctx = EvaluationContext() - assert ctx.target_type is None - assert ctx.target_id is None - assert ctx.agent_name is None - assert ctx.step_type is None - # Confirm we did not silently keep namespace_key around; reading an - # unknown attribute should fail. - with pytest.raises(AttributeError): - _ = ctx.namespace_key # type: ignore[attr-defined] - - def test_evaluation_context_is_importable_from_evaluators_package(self): - """EvaluationContext is re-exported from agent_control_evaluators so - subclasses can colocate their imports. - """ - from agent_control_evaluators import EvaluationContext as Reexported - from agent_control_models import EvaluationContext as Canonical - - assert Reexported is Canonical diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index 6c2b7d61..51d34c96 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -86,7 +86,6 @@ class ScorerInvokeRequest(BaseModel): Attributes: inputs: Selected scorer input values. - logstream_id: Optional Galileo log stream identifier for runtime context. scorer_label: Preset, registered, or fine-tuned scorer label. scorer_id: Optional Galileo scorer identifier. scorer_version_id: Optional Galileo scorer version identifier. 
@@ -94,7 +93,6 @@ class ScorerInvokeRequest(BaseModel): """ inputs: ScorerInvokeInputs - logstream_id: str | None = Field(default=None, min_length=1) scorer_label: str | None = Field(default=None, min_length=1) scorer_id: str | None = Field(default=None, min_length=1) scorer_version_id: str | None = Field(default=None, min_length=1) @@ -242,7 +240,6 @@ async def invoke( scorer_label: str | None = None, scorer_id: str | None = None, scorer_version_id: str | None = None, - logstream_id: str | None = None, input: JSONValue = None, output: JSONValue = None, config: JSONObject | None = None, @@ -255,7 +252,6 @@ async def invoke( scorer_label: Preset, registered, or fine-tuned scorer label. scorer_id: Optional Galileo scorer identifier. scorer_version_id: Optional Galileo scorer version identifier. - logstream_id: Optional Galileo log stream identifier for runtime context. input: Optional user/system prompt text. output: Optional model response text. config: Optional scorer-specific configuration. @@ -280,7 +276,6 @@ async def invoke( scorer_label=scorer_label, scorer_id=scorer_id, scorer_version_id=scorer_version_id, - logstream_id=logstream_id, inputs=ScorerInvokeInputs( query="" if input is None else input, response="" if output is None else output ), diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index 1136ffbe..c49dd716 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -31,7 +31,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): """Configuration for direct Luna scorer evaluation. Attributes: - logstream_id: Optional Galileo log stream identifier used as runtime context. scorer_label: Preset, registered, or fine-tuned scorer label. scorer_id: Optional Galileo scorer identifier. 
scorer_version_id: Optional Galileo scorer version identifier. @@ -41,11 +40,6 @@ class LunaEvaluatorConfig(EvaluatorConfig): timeout_ms: Request timeout in milliseconds. """ - logstream_id: str | None = Field( - default=None, - min_length=1, - description="Optional Galileo log stream identifier used as scorer runtime context.", - ) scorer_label: str | None = Field( default=None, min_length=1, diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index 79749a84..ce46cf44 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -9,7 +9,7 @@ from typing import Any from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator -from agent_control_models import EvaluationContext, EvaluatorResult, JSONValue +from agent_control_models import EvaluatorResult, JSONValue from .client import GalileoLunaClient, ScorerInvokeResponse from .config import LunaEvaluatorConfig, coerce_number @@ -76,10 +76,6 @@ def _confidence_from_score(score: JSONValue) -> float: return 1.0 -def _is_logstream_target(target_type: str | None) -> bool: - return (target_type or "").lower() in {"logstream", "log_stream", "log-stream"} - - @register_evaluator class LunaEvaluator(Evaluator[LunaEvaluatorConfig]): """Galileo Luna evaluator using the direct scorer invocation API.""" @@ -175,35 +171,25 @@ def _score_matches(self, score: JSONValue) -> bool: raise ValueError(f"Unsupported Luna operator: {operator}") async def evaluate(self, data: Any) -> EvaluatorResult: - """Evaluate selected data without runtime context.""" - return await self.evaluate_with_context(data, context=None) - - async def evaluate_with_context( - self, - data: Any, - context: EvaluationContext | None = None, - ) -> EvaluatorResult: """Evaluate selected data with 
Galileo Luna direct scorer invocation. Args: data: The data selected from the runtime step. - context: Optional runtime context from the engine. Returns: EvaluatorResult with local threshold decision and scorer metadata. """ input_text, output_text = self._prepare_payload(data) - logstream_id = self._resolve_logstream_id(context) if not (_has_text(input_text) or _has_text(output_text)): return EvaluatorResult( matched=False, confidence=1.0, message="No data to score with Luna", - metadata=self._base_metadata(logstream_id=logstream_id), + metadata=self._base_metadata(), ) try: - scorer_kwargs = self._scorer_kwargs(logstream_id=logstream_id) + scorer_kwargs = self._scorer_kwargs() response = await self._get_client().invoke( **scorer_kwargs, input=input_text if _has_text(input_text) else None, @@ -217,7 +203,7 @@ async def evaluate_with_context( raise RuntimeError(message) matched = self._score_matches(response.score) - metadata = self._metadata(response, logstream_id=logstream_id) + metadata = self._metadata(response) operator = self.config.operator threshold = self.config.threshold state = "triggered" if matched else "not triggered" @@ -232,25 +218,18 @@ async def evaluate_with_context( ) except Exception as exc: logger.error("Luna evaluation error: %s", exc, exc_info=True) - return self._handle_error(exc, logstream_id=logstream_id) - - def _resolve_logstream_id(self, context: EvaluationContext | None) -> str | None: - if context is not None and _is_logstream_target(context.target_type): - return context.target_id - return self.config.logstream_id + return self._handle_error(exc) - def _base_metadata(self, *, logstream_id: str | None = None) -> dict[str, Any]: + def _base_metadata(self) -> dict[str, Any]: metadata = { - "logstream_id": logstream_id, "scorer_label": self.config.scorer_label, "scorer_id": self.config.scorer_id, "scorer_version_id": self.config.scorer_version_id, } return {key: value for key, value in metadata.items() if value is not None} - def 
_scorer_kwargs(self, *, logstream_id: str | None = None) -> dict[str, Any]: + def _scorer_kwargs(self) -> dict[str, Any]: kwargs = { - "logstream_id": logstream_id, "scorer_label": self.config.scorer_label, "scorer_id": self.config.scorer_id, "scorer_version_id": self.config.scorer_version_id, @@ -260,10 +239,8 @@ def _scorer_kwargs(self, *, logstream_id: str | None = None) -> dict[str, Any]: def _metadata( self, response: ScorerInvokeResponse, - *, - logstream_id: str | None = None, ) -> dict[str, Any]: - metadata: dict[str, Any] = self._base_metadata(logstream_id=logstream_id) + metadata: dict[str, Any] = self._base_metadata() metadata.update({ "scorer_label": response.scorer_label or self.config.scorer_label, "score": response.score, @@ -278,8 +255,6 @@ def _metadata( def _handle_error( self, error: Exception, - *, - logstream_id: str | None = None, ) -> EvaluatorResult: error_detail = str(error) return EvaluatorResult( @@ -292,7 +267,6 @@ def _handle_error( "scorer_label": self.config.scorer_label, "scorer_id": self.config.scorer_id, "scorer_version_id": self.config.scorer_version_id, - "logstream_id": logstream_id, }, error=error_detail, ) diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 201d9f73..1b7a6e94 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -27,7 +27,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None: # Given: a direct scorer config with local thresholding config = LunaEvaluatorConfig( - logstream_id="logstream-123", scorer_label="toxicity", scorer_id="scorer-123", scorer_version_id="version-123", @@ -37,7 +36,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None: ) # Then: config is retained without Protect concepts - assert config.logstream_id == "logstream-123" assert config.scorer_label == "toxicity" assert config.scorer_id == "scorer-123" assert 
config.scorer_version_id == "version-123" @@ -75,7 +73,6 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: # Given: a scorer request with scorer config request = ScorerInvokeRequest( - logstream_id="logstream-123", scorer_label="toxicity", scorer_id="scorer-123", scorer_version_id="version-123", @@ -85,7 +82,6 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None: # Then: the serialized payload uses the API-owned scorer invoke fields assert request.to_dict() == { - "logstream_id": "logstream-123", "scorer_label": "toxicity", "scorer_id": "scorer-123", "scorer_version_id": "version-123", @@ -409,46 +405,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None: timeout=5.0, ) - @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @pytest.mark.asyncio - async def test_evaluator_passes_logstream_id_from_runtime_context(self) -> None: - from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse - from agent_control_evaluator_galileo.luna.client import GalileoLunaClient - from agent_control_models import EvaluationContext - - evaluator = LunaEvaluator.from_dict( - { - "logstream_id": "config-logstream", - "scorer_label": "toxicity", - "scorer_id": "scorer-123", - "scorer_version_id": "version-123", - } - ) - - with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: - mock_invoke.return_value = ScorerInvokeResponse( - scorer_label="toxicity", - score=0.82, - status="success", - ) - - result = await evaluator.evaluate_with_context( - "hello", - EvaluationContext(target_type="log_stream", target_id="runtime-logstream"), - ) - - assert result.metadata["logstream_id"] == "runtime-logstream" - mock_invoke.assert_awaited_once_with( - logstream_id="runtime-logstream", - scorer_label="toxicity", - scorer_id="scorer-123", - scorer_version_id="version-123", - input="hello", - output=None, - config=None, - timeout=10.0, - ) - @patch.dict(os.environ, 
{"GALILEO_API_KEY": "test-key"}) @pytest.mark.asyncio async def test_evaluator_returns_non_match_below_threshold(self) -> None: diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py index 867367bd..148cdd7a 100644 --- a/models/src/agent_control_models/__init__.py +++ b/models/src/agent_control_models/__init__.py @@ -61,7 +61,6 @@ make_error_type, ) from .evaluation import ( - EvaluationContext, EvaluationRequest, EvaluationResponse, EvaluationResult, @@ -133,7 +132,6 @@ # Policy "Policy", # Evaluation - "EvaluationContext", "EvaluationRequest", "EvaluationResponse", "EvaluationResult", diff --git a/models/src/agent_control_models/evaluation.py b/models/src/agent_control_models/evaluation.py index 5c81b0c1..50b5791b 100644 --- a/models/src/agent_control_models/evaluation.py +++ b/models/src/agent_control_models/evaluation.py @@ -1,5 +1,4 @@ """Evaluation-related models.""" -from dataclasses import dataclass from typing import Annotated, Literal, Self from pydantic import Field, StringConstraints, field_validator, model_validator @@ -14,22 +13,6 @@ ] -@dataclass -class EvaluationContext: - """Runtime context available while evaluating a control. - - This is intentionally small and mutable by normal dataclass semantics so - downstream users can subclass it with richer runtime context when needed. - Only fields the engine actually populates today are declared here; add new - fields only when there is a populator on every supported call path. - """ - - target_type: str | None = None - target_id: str | None = None - agent_name: str | None = None - step_type: str | None = None - - class EvaluationRequest(BaseModel): """ Request model for evaluation analysis. 
From a06d3f10edbd66fcd6d877d94b02545fb11cd241 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Fri, 15 May 2026 14:59:33 -0700 Subject: [PATCH 14/20] add tests for coverage --- .../builtin/tests/list/test_list_extra.py | 63 ++++++ evaluators/builtin/tests/regex/__init__.py | 0 evaluators/builtin/tests/regex/test_regex.py | 115 +++++++++++ .../tests/sql/test_sql_config_validation.py | 103 ++++++++++ evaluators/builtin/tests/test_discovery.py | 187 ++++++++++++++++++ evaluators/builtin/tests/test_factory.py | 172 ++++++++++++++++ evaluators/builtin/tests/test_registry.py | 119 +++++++++++ 7 files changed, 759 insertions(+) create mode 100644 evaluators/builtin/tests/list/test_list_extra.py create mode 100644 evaluators/builtin/tests/regex/__init__.py create mode 100644 evaluators/builtin/tests/regex/test_regex.py create mode 100644 evaluators/builtin/tests/sql/test_sql_config_validation.py create mode 100644 evaluators/builtin/tests/test_discovery.py create mode 100644 evaluators/builtin/tests/test_factory.py create mode 100644 evaluators/builtin/tests/test_registry.py diff --git a/evaluators/builtin/tests/list/test_list_extra.py b/evaluators/builtin/tests/list/test_list_extra.py new file mode 100644 index 00000000..ff8fe90a --- /dev/null +++ b/evaluators/builtin/tests/list/test_list_extra.py @@ -0,0 +1,63 @@ +"""Targeted tests covering match_mode branches and edge-case messages.""" + +from __future__ import annotations + +import pytest +from agent_control_evaluators.list.config import ListEvaluatorConfig +from agent_control_evaluators.list.evaluator import ListEvaluator + + +@pytest.mark.asyncio +async def test_match_mode_contains_uses_word_boundary(): + """contains mode matches whole words but rejects sub-word matches.""" + config = ListEvaluatorConfig(values=["admin"], match_mode="contains") + evaluator = ListEvaluator(config) + + matched = await evaluator.evaluate("the admin user logged in") + assert matched.matched is True + + not_matched = await 
evaluator.evaluate("administrator") # sub-word, no boundary + assert not_matched.matched is False + + +@pytest.mark.asyncio +async def test_match_mode_exact_is_the_default(): + """No explicit mode uses anchored exact matching.""" + config = ListEvaluatorConfig(values=["admin"]) + evaluator = ListEvaluator(config) + + exact = await evaluator.evaluate("admin") + assert exact.matched is True + + partial = await evaluator.evaluate("admin user") # not anchored end + assert partial.matched is False + + +@pytest.mark.asyncio +async def test_data_none_returns_empty_input_message(): + """None input is treated as empty and the control is ignored.""" + config = ListEvaluatorConfig(values=["x"]) + evaluator = ListEvaluator(config) + + result = await evaluator.evaluate(None) + + assert result.matched is False + assert result.message == "Empty input - control ignored" + assert result.metadata["input_count"] == 0 + + +@pytest.mark.asyncio +async def test_message_truncates_match_list_at_five(): + """More than five matches collapse into a ``(+N more)`` suffix.""" + config = ListEvaluatorConfig( + values=["a", "b", "c", "d", "e", "f", "g"], + logic="any", + ) + evaluator = ListEvaluator(config) + + result = await evaluator.evaluate(["a", "b", "c", "d", "e", "f", "g"]) + + assert result.matched is True + # First five matches appear, the rest summarized. 
+ assert "a, b, c, d, e" in result.message + assert "(+2 more)" in result.message diff --git a/evaluators/builtin/tests/regex/__init__.py b/evaluators/builtin/tests/regex/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/builtin/tests/regex/test_regex.py b/evaluators/builtin/tests/regex/test_regex.py new file mode 100644 index 00000000..9df69560 --- /dev/null +++ b/evaluators/builtin/tests/regex/test_regex.py @@ -0,0 +1,115 @@ +"""Tests for the regex evaluator and its config validation.""" + +from __future__ import annotations + +import pytest +from agent_control_evaluators.regex.config import RegexEvaluatorConfig +from agent_control_evaluators.regex.evaluator import RegexEvaluator + + +class TestRegexConfig: + """Pattern validation rejects invalid RE2 syntax at config time.""" + + def test_valid_pattern_accepted(self): + config = RegexEvaluatorConfig(pattern=r"\d{3}-\d{2}-\d{4}") + assert config.pattern == r"\d{3}-\d{2}-\d{4}" + + def test_empty_pattern_accepted(self): + # Empty string is technically a valid RE2 pattern (matches everything). 
+ config = RegexEvaluatorConfig(pattern="") + assert config.pattern == "" + + def test_invalid_pattern_rejected(self): + with pytest.raises(ValueError, match="Invalid regex pattern"): + RegexEvaluatorConfig(pattern="[invalid(regex") + + def test_flags_default_to_none(self): + config = RegexEvaluatorConfig(pattern=r"\d+") + assert config.flags is None + + def test_flags_can_be_specified(self): + config = RegexEvaluatorConfig(pattern="secret", flags=["IGNORECASE"]) + assert config.flags == ["IGNORECASE"] + + +class TestRegexEvaluator: + """Pattern matching against arbitrary data.""" + + @pytest.mark.asyncio + async def test_match_returns_matched_true(self): + evaluator = RegexEvaluator.from_dict({"pattern": r"\d{3}-\d{4}"}) + + result = await evaluator.evaluate("call 555-1234 today") + + assert result.matched is True + assert result.confidence == 1.0 + assert "found" in result.message + assert result.metadata["pattern"] == r"\d{3}-\d{4}" + + @pytest.mark.asyncio + async def test_no_match_returns_matched_false(self): + evaluator = RegexEvaluator.from_dict({"pattern": r"\d{3}-\d{4}"}) + + result = await evaluator.evaluate("no numbers here") + + assert result.matched is False + assert "not found" in result.message + + @pytest.mark.asyncio + async def test_none_data_returns_no_data_message(self): + evaluator = RegexEvaluator.from_dict({"pattern": r".*"}) + + result = await evaluator.evaluate(None) + + assert result.matched is False + assert result.message == "No data to match" + + @pytest.mark.asyncio + async def test_non_string_data_is_coerced(self): + """Non-string inputs are stringified before matching.""" + evaluator = RegexEvaluator.from_dict({"pattern": r"^42$"}) + + result = await evaluator.evaluate(42) + + assert result.matched is True + + @pytest.mark.asyncio + async def test_ignorecase_flag_short_form(self): + """The ``I`` short form is treated the same as ``IGNORECASE``.""" + evaluator = RegexEvaluator.from_dict( + {"pattern": "SECRET", "flags": ["I"]}, + ) + 
+ result = await evaluator.evaluate("the secret value") + + assert result.matched is True + + @pytest.mark.asyncio + async def test_ignorecase_flag_long_form(self): + evaluator = RegexEvaluator.from_dict( + {"pattern": "secret", "flags": ["IGNORECASE"]}, + ) + + result = await evaluator.evaluate("THE SECRET VALUE") + + assert result.matched is True + + @pytest.mark.asyncio + async def test_unknown_flag_is_ignored(self): + """RE2 supports a narrow flag set; unknown flag names must not raise.""" + evaluator = RegexEvaluator.from_dict( + {"pattern": "x", "flags": ["MULTILINE"]}, + ) + + result = await evaluator.evaluate("xyz") + + # Should still work — unknown flag is silently dropped, not an error. + assert result.matched is True + + @pytest.mark.asyncio + async def test_case_sensitive_by_default(self): + evaluator = RegexEvaluator.from_dict({"pattern": "Secret"}) + + result = await evaluator.evaluate("the secret value") + + assert result.matched is False diff --git a/evaluators/builtin/tests/sql/test_sql_config_validation.py b/evaluators/builtin/tests/sql/test_sql_config_validation.py new file mode 100644 index 00000000..8842ed4f --- /dev/null +++ b/evaluators/builtin/tests/sql/test_sql_config_validation.py @@ -0,0 +1,103 @@ +"""Targeted tests for SQLEvaluatorConfig validate_config branches.""" + +from __future__ import annotations + +import warnings + +import pytest +from agent_control_evaluators.sql.config import SQLEvaluatorConfig + + +class TestConflictingRestrictions: + """Mutually-exclusive allow/block lists must be rejected at config time.""" + + def test_blocked_and_allowed_operations_conflict(self): + with pytest.raises(ValueError, match="blocked_operations and allowed_operations"): + SQLEvaluatorConfig( + blocked_operations=["DELETE"], + allowed_operations=["SELECT"], + ) + + def test_blocked_and_allowed_tables_conflict(self): + with pytest.raises(ValueError, match="allowed_tables and blocked_tables"): + SQLEvaluatorConfig( + allowed_tables=["users"], + 
blocked_tables=["secrets"], + ) + + def test_blocked_and_allowed_schemas_conflict(self): + with pytest.raises(ValueError, match="allowed_schemas and blocked_schemas"): + SQLEvaluatorConfig( + allowed_schemas=["public"], + blocked_schemas=["internal"], + ) + + +class TestLimitBounds: + """Numeric controls must be positive.""" + + def test_max_limit_must_be_positive(self): + with pytest.raises(ValueError, match="max_limit must be a positive integer"): + SQLEvaluatorConfig(max_limit=0) + + def test_max_limit_negative_rejected(self): + with pytest.raises(ValueError, match="max_limit must be a positive integer"): + SQLEvaluatorConfig(max_limit=-5) + + def test_max_statements_must_be_positive(self): + with pytest.raises(ValueError, match="max_statements must be a positive integer"): + SQLEvaluatorConfig( + allow_multi_statements=True, + max_statements=0, + ) + + +class TestColumnControls: + """Column-level validators cover required_column_values shape rules.""" + + def test_column_context_without_required_columns_warns(self): + with pytest.warns(UserWarning, match="column_context is set but required_columns"): + SQLEvaluatorConfig(column_context="where") + + def test_required_column_values_rejects_empty_column_ref(self): + with pytest.raises(ValueError, match="empty column reference"): + SQLEvaluatorConfig( + required_columns=["tenant_id"], + required_column_values={" ": "tenant_id"}, + ) + + def test_required_column_values_rejects_malformed_qualified_ref(self): + with pytest.raises( + ValueError, match="'table.column' format when qualified" + ): + SQLEvaluatorConfig( + required_columns=["tenant_id"], + required_column_values={"users.": "tenant_id"}, + ) + + def test_required_column_values_rejects_blank_qualified_table_side(self): + with pytest.raises( + ValueError, match="'table.column' format when qualified" + ): + SQLEvaluatorConfig( + required_columns=["tenant_id"], + required_column_values={".tenant_id": "tenant_id"}, + ) + + def 
test_required_column_values_rejects_empty_context_key(self): + with pytest.raises(ValueError, match="empty context key"): + SQLEvaluatorConfig( + required_columns=["tenant_id"], + required_column_values={"users.tenant_id": " "}, + ) + + def test_valid_required_column_values_accepted(self): + """Sanity check: a valid combination passes without raising.""" + with warnings.catch_warnings(): + warnings.simplefilter("error") # promote any warning to a failure + config = SQLEvaluatorConfig( + required_columns=["tenant_id"], + column_context="where", + required_column_values={"users.tenant_id": "tenant_id"}, + ) + assert config.required_column_values == {"users.tenant_id": "tenant_id"} diff --git a/evaluators/builtin/tests/test_discovery.py b/evaluators/builtin/tests/test_discovery.py new file mode 100644 index 00000000..62876412 --- /dev/null +++ b/evaluators/builtin/tests/test_discovery.py @@ -0,0 +1,187 @@ +"""Tests for entry-point-based evaluator discovery.""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +from agent_control_evaluators import ( + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + clear_evaluators, + discover_evaluators, + ensure_evaluators_discovered, + get_all_evaluators, + list_evaluators, + register_evaluator, + reset_evaluator_discovery, +) +from agent_control_evaluators import _discovery as discovery_module +from agent_control_models import EvaluatorResult + + +class _DiscoveryConfig(EvaluatorConfig): + pass + + +def _make_class(*, name: str, available: bool = True) -> type[Evaluator[_DiscoveryConfig]]: + class _Dummy(Evaluator[_DiscoveryConfig]): + metadata = EvaluatorMetadata(name=name, version="1.0.0", description="") + config_model = _DiscoveryConfig + + @classmethod + def is_available(cls) -> bool: + return available + + async def evaluate(self, data: Any) -> EvaluatorResult: + return EvaluatorResult(matched=False, confidence=1.0, message="") + + _Dummy.__name__ 
= f"Discovery_{name.replace('-', '_')}" + return _Dummy + + +@pytest.fixture +def isolated_discovery(): + """Snapshot registry + discovery flag, restore on teardown.""" + snapshot = dict(get_all_evaluators()) + clear_evaluators() + reset_evaluator_discovery() + yield + clear_evaluators() + reset_evaluator_discovery() + for cls in snapshot.values(): + register_evaluator(cls) + + +def _make_fake_entry_point(name: str, evaluator_class: type[Any]) -> MagicMock: + """Build a MagicMock that mimics importlib.metadata.EntryPoint.""" + ep = MagicMock() + ep.name = name + ep.load.return_value = evaluator_class + return ep + + +def test_discover_evaluators_registers_available_classes(isolated_discovery): + """Discover walks the entry-point group and registers each available class.""" + cls = _make_class(name="disc-a") + fake_ep = _make_fake_entry_point("disc-a", cls) + + with patch.object(discovery_module, "entry_points", return_value=[fake_ep]): + count = discover_evaluators() + + assert count == 1 + assert get_all_evaluators().get("disc-a") is cls + + +def test_discover_evaluators_skips_unavailable_classes(isolated_discovery): + """Evaluators whose is_available() is False must NOT be registered.""" + cls = _make_class(name="disc-unavailable", available=False) + fake_ep = _make_fake_entry_point("disc-unavailable", cls) + + with patch.object(discovery_module, "entry_points", return_value=[fake_ep]): + count = discover_evaluators() + + assert count == 0 + assert "disc-unavailable" not in get_all_evaluators() + + +def test_discover_evaluators_skips_already_registered(isolated_discovery): + """Already-registered names are skipped without raising.""" + cls = _make_class(name="disc-existing") + register_evaluator(cls) + + fake_ep = _make_fake_entry_point("disc-existing", cls) + with patch.object(discovery_module, "entry_points", return_value=[fake_ep]): + count = discover_evaluators() + + assert count == 0 + + +def test_discover_evaluators_only_runs_once(isolated_discovery): + 
"""Repeat calls short-circuit on the _DISCOVERY_COMPLETE flag.""" + cls = _make_class(name="disc-once") + fake_ep = _make_fake_entry_point("disc-once", cls) + + with patch.object( + discovery_module, "entry_points", return_value=[fake_ep] + ) as patched: + first = discover_evaluators() + second = discover_evaluators() + + # First call discovers, second returns 0 without consulting entry_points. + assert first == 1 + assert second == 0 + assert patched.call_count == 1 + + +def test_discover_evaluators_swallows_load_failures(isolated_discovery): + """A broken entry point is logged and skipped, not propagated.""" + bad_ep = MagicMock() + bad_ep.name = "broken" + bad_ep.load.side_effect = RuntimeError("boom") + + good_cls = _make_class(name="disc-good") + good_ep = _make_fake_entry_point("disc-good", good_cls) + + with patch.object(discovery_module, "entry_points", return_value=[bad_ep, good_ep]): + count = discover_evaluators() + + assert count == 1 + assert get_all_evaluators().get("disc-good") is good_cls + + +def test_discover_evaluators_handles_entry_points_failure(isolated_discovery): + """If entry_points() itself raises, discovery completes with zero results.""" + with patch.object( + discovery_module, + "entry_points", + side_effect=RuntimeError("entry-point system unavailable"), + ): + count = discover_evaluators() + + assert count == 0 + + +def test_reset_evaluator_discovery_allows_rerun(isolated_discovery): + """reset_evaluator_discovery clears the completed flag so discover runs again.""" + cls = _make_class(name="disc-reset") + fake_ep = _make_fake_entry_point("disc-reset", cls) + + with patch.object( + discovery_module, "entry_points", return_value=[fake_ep] + ) as patched: + discover_evaluators() + clear_evaluators() + reset_evaluator_discovery() + count = discover_evaluators() + + assert count == 1 + assert patched.call_count == 2 + + +def test_ensure_evaluators_discovered_runs_once(isolated_discovery): + """ensure_evaluators_discovered is the lazy-init 
entry point.""" + cls = _make_class(name="disc-ensure") + fake_ep = _make_fake_entry_point("disc-ensure", cls) + + with patch.object( + discovery_module, "entry_points", return_value=[fake_ep] + ) as patched: + ensure_evaluators_discovered() + ensure_evaluators_discovered() + + assert patched.call_count == 1 + assert get_all_evaluators().get("disc-ensure") is cls + + +def test_list_evaluators_triggers_discovery(isolated_discovery): + """list_evaluators is the convenience accessor; it must trigger discovery.""" + cls = _make_class(name="disc-list") + fake_ep = _make_fake_entry_point("disc-list", cls) + + with patch.object(discovery_module, "entry_points", return_value=[fake_ep]): + result = list_evaluators() + + assert result.get("disc-list") is cls diff --git a/evaluators/builtin/tests/test_factory.py b/evaluators/builtin/tests/test_factory.py new file mode 100644 index 00000000..4bba4b82 --- /dev/null +++ b/evaluators/builtin/tests/test_factory.py @@ -0,0 +1,172 @@ +"""Tests for the LRU-cached evaluator factory.""" + +from __future__ import annotations + +import importlib +from typing import Any + +import pytest +from agent_control_evaluators import ( + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + clear_evaluator_cache, + clear_evaluators, + get_all_evaluators, + get_evaluator_instance, + register_evaluator, +) +from agent_control_evaluators import _factory as factory_module +from agent_control_models import EvaluatorResult, EvaluatorSpec + + +class _FactoryConfig(EvaluatorConfig): + payload: str = "default" + + +class _FactoryEvaluator(Evaluator[_FactoryConfig]): + metadata = EvaluatorMetadata(name="factory-dummy", version="1.0.0", description="") + config_model = _FactoryConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + return EvaluatorResult(matched=False, confidence=1.0, message="") + + +@pytest.fixture +def isolated_factory(): + """Snapshot registry/cache so factory tests don't leak state.""" + snapshot = dict(get_all_evaluators()) 
+ clear_evaluators() + clear_evaluator_cache() + register_evaluator(_FactoryEvaluator) + yield + clear_evaluator_cache() + clear_evaluators() + for cls in snapshot.values(): + register_evaluator(cls) + + +def test_get_evaluator_instance_returns_evaluator(isolated_factory): + spec = EvaluatorSpec(name="factory-dummy", config={"payload": "p1"}) + + instance = get_evaluator_instance(spec) + + assert isinstance(instance, _FactoryEvaluator) + assert instance.config.payload == "p1" + + +def test_get_evaluator_instance_caches_by_config(isolated_factory): + spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "same"}) + spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "same"}) + + first = get_evaluator_instance(spec_a) + second = get_evaluator_instance(spec_b) + + # Same config = same cached instance. + assert first is second + + +def test_get_evaluator_instance_treats_different_configs_separately(isolated_factory): + spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "a"}) + spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "b"}) + + instance_a = get_evaluator_instance(spec_a) + instance_b = get_evaluator_instance(spec_b) + + assert instance_a is not instance_b + assert instance_a.config.payload == "a" + assert instance_b.config.payload == "b" + + +def test_get_evaluator_instance_raises_for_unknown_evaluator(isolated_factory): + with pytest.raises(ValueError, match="not found"): + get_evaluator_instance(EvaluatorSpec(name="no-such-evaluator", config={})) + + +def test_clear_evaluator_cache_forces_recreation(isolated_factory): + spec = EvaluatorSpec(name="factory-dummy", config={"payload": "p"}) + + first = get_evaluator_instance(spec) + clear_evaluator_cache() + second = get_evaluator_instance(spec) + + assert first is not second + + +def test_get_evaluator_instance_evicts_oldest_when_full(isolated_factory, monkeypatch): + """LRU eviction: when cache is full, the least-recently-used entry is dropped.""" + # Force a tiny 
cache so we can observe eviction without overhead. + monkeypatch.setattr(factory_module, "EVALUATOR_CACHE_SIZE", 2) + + spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "a"}) + spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "b"}) + spec_c = EvaluatorSpec(name="factory-dummy", config={"payload": "c"}) + + first_a = get_evaluator_instance(spec_a) + get_evaluator_instance(spec_b) + # Insert third → "a" is the LRU and must be evicted. + get_evaluator_instance(spec_c) + + re_a = get_evaluator_instance(spec_a) + # "a" was evicted: new instance must NOT be the original. + assert re_a is not first_a + + +def test_get_evaluator_instance_moves_hit_to_most_recent( + isolated_factory, monkeypatch +): + """Cache hit must refresh LRU recency so the touched entry isn't evicted next.""" + monkeypatch.setattr(factory_module, "EVALUATOR_CACHE_SIZE", 2) + + spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "a"}) + spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "b"}) + spec_c = EvaluatorSpec(name="factory-dummy", config={"payload": "c"}) + + first_a = get_evaluator_instance(spec_a) + get_evaluator_instance(spec_b) + # Touch "a" so "b" becomes the LRU. + re_a = get_evaluator_instance(spec_a) + assert re_a is first_a + + # Inserting "c" should evict "b", not "a". 
+ get_evaluator_instance(spec_c) + + refetched_a = get_evaluator_instance(spec_a) + assert refetched_a is first_a # still cached + + +def test_parse_cache_size_uses_default_when_unset(monkeypatch): + monkeypatch.delenv("EVALUATOR_CACHE_SIZE", raising=False) + reloaded = importlib.reload(factory_module) + try: + assert reloaded.EVALUATOR_CACHE_SIZE == factory_module.DEFAULT_CACHE_SIZE + finally: + importlib.reload(factory_module) + + +def test_parse_cache_size_falls_back_on_invalid_value(monkeypatch): + monkeypatch.setenv("EVALUATOR_CACHE_SIZE", "not-a-number") + reloaded = importlib.reload(factory_module) + try: + assert reloaded.EVALUATOR_CACHE_SIZE == reloaded.DEFAULT_CACHE_SIZE + finally: + importlib.reload(factory_module) + + +def test_parse_cache_size_clamps_to_minimum(monkeypatch): + monkeypatch.setenv("EVALUATOR_CACHE_SIZE", "0") + reloaded = importlib.reload(factory_module) + try: + # Anything below MIN_CACHE_SIZE is clamped to avoid infinite eviction loops. + assert reloaded.EVALUATOR_CACHE_SIZE >= reloaded.MIN_CACHE_SIZE + finally: + importlib.reload(factory_module) + + +def test_parse_cache_size_accepts_valid_int(monkeypatch): + monkeypatch.setenv("EVALUATOR_CACHE_SIZE", "42") + reloaded = importlib.reload(factory_module) + try: + assert reloaded.EVALUATOR_CACHE_SIZE == 42 + finally: + importlib.reload(factory_module) diff --git a/evaluators/builtin/tests/test_registry.py b/evaluators/builtin/tests/test_registry.py new file mode 100644 index 00000000..6b663129 --- /dev/null +++ b/evaluators/builtin/tests/test_registry.py @@ -0,0 +1,119 @@ +"""Tests for the in-memory evaluator registry.""" + +from __future__ import annotations + +from typing import Any + +import pytest +from agent_control_evaluators import ( + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + clear_evaluators, + get_all_evaluators, + get_evaluator, + register_evaluator, +) +from agent_control_models import EvaluatorResult + + +class _DummyConfig(EvaluatorConfig): + pass + + +def 
_make_class(*, name: str, available: bool = True) -> type[Evaluator[_DummyConfig]]: + """Build a fresh Evaluator subclass with the supplied metadata name.""" + + class _Dummy(Evaluator[_DummyConfig]): + metadata = EvaluatorMetadata( + name=name, + version="1.0.0", + description="", + ) + config_model = _DummyConfig + + @classmethod + def is_available(cls) -> bool: + return available + + async def evaluate(self, data: Any) -> EvaluatorResult: + return EvaluatorResult(matched=False, confidence=1.0, message="") + + _Dummy.__name__ = f"Dummy_{name.replace('-', '_')}" + return _Dummy + + +@pytest.fixture +def isolated_registry(): + """Snapshot and restore the global registry so tests don't leak state.""" + snapshot = dict(get_all_evaluators()) + clear_evaluators() + yield + clear_evaluators() + for cls in snapshot.values(): + register_evaluator(cls) + + +def test_register_and_lookup_evaluator(isolated_registry): + cls = _make_class(name="reg-a") + + register_evaluator(cls) + + assert get_evaluator("reg-a") is cls + + +def test_get_evaluator_returns_none_when_not_registered(isolated_registry): + assert get_evaluator("does-not-exist") is None + + +def test_get_all_evaluators_returns_copy(isolated_registry): + cls = _make_class(name="reg-copy") + register_evaluator(cls) + + snapshot = get_all_evaluators() + snapshot["evil"] = cls # mutate the returned dict + + # Internal registry must not reflect external mutation. + assert "evil" not in get_all_evaluators() + + +def test_register_is_idempotent_for_same_class(isolated_registry): + cls = _make_class(name="reg-idem") + + register_evaluator(cls) + # Registering the exact same class again must not raise. 
+ assert register_evaluator(cls) is cls + + +def test_register_rejects_name_collision_with_different_class(isolated_registry): + first = _make_class(name="reg-conflict") + second = _make_class(name="reg-conflict") + register_evaluator(first) + + with pytest.raises(ValueError, match="already registered"): + register_evaluator(second) + + +def test_register_skips_unavailable_evaluators(isolated_registry): + cls = _make_class(name="reg-unavailable", available=False) + + # Should not raise and should not register. + assert register_evaluator(cls) is cls + assert get_evaluator("reg-unavailable") is None + + +def test_clear_evaluators_empties_registry(isolated_registry): + register_evaluator(_make_class(name="reg-c1")) + register_evaluator(_make_class(name="reg-c2")) + assert len(get_all_evaluators()) == 2 + + clear_evaluators() + + assert get_all_evaluators() == {} + + +def test_register_decorator_returns_class(isolated_registry): + cls = _make_class(name="reg-decorator") + # The function is documented as decorator-compatible: it must return the class. + decorated = register_evaluator(cls) + assert decorated is cls From c8662f72fe1389d8cede20e694317a3c0a784fb7 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Sat, 16 May 2026 06:34:08 -0700 Subject: [PATCH 15/20] coverage --- .../galileo/tests/test_luna_coverage_gaps.py | 567 ++++++++++++++++++ .../tests/test_evaluators_optional_imports.py | 93 +++ 2 files changed, 660 insertions(+) create mode 100644 evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py create mode 100644 sdks/python/tests/test_evaluators_optional_imports.py diff --git a/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py new file mode 100644 index 00000000..f4d0e360 --- /dev/null +++ b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py @@ -0,0 +1,567 @@ +"""Targeted tests filling coverage gaps in luna/evaluator.py and luna/client.py. 
+ +These tests cover the small utility functions and rare branches that the +integration-style tests in ``test_luna_evaluator.py`` skip past. +""" + +from __future__ import annotations + +import json +import os +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +import pytest + + +# ============================================================================= +# luna/evaluator.py: utility helpers +# ============================================================================= + + +class TestCoercePayloadText: + """``_coerce_payload_text`` normalises arbitrary values to strings.""" + + def test_none_returns_none(self): + from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text + + assert _coerce_payload_text(None) is None + + def test_string_passed_through(self): + from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text + + assert _coerce_payload_text("hello") == "hello" + + @pytest.mark.parametrize("value", [42, 3.14, True]) + def test_scalars_stringified(self, value): + from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text + + assert _coerce_payload_text(value) == str(value) + + def test_dict_is_json_serialized(self): + from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text + + result = _coerce_payload_text({"a": 1, "b": 2}) + + assert json.loads(result) == {"a": 1, "b": 2} + + def test_unserialisable_falls_back_to_str(self): + from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text + + class CannotJson: + def __repr__(self): + return "" + + # json.dumps with default=str would actually serialize this, so use + # something that breaks both the JSON pass AND triggers TypeError. + cannot = CannotJson() + result = _coerce_payload_text({"obj": cannot}) + + # default=str converts the inner object, so we still get a JSON string. 
+ assert isinstance(result, str) + + +class TestExtractDictText: + """``_extract_dict_text`` returns ``None`` for missing keys.""" + + def test_missing_key_returns_none(self): + from agent_control_evaluator_galileo.luna.evaluator import _extract_dict_text + + assert _extract_dict_text({}, "absent") is None + + def test_present_key_coerced(self): + from agent_control_evaluator_galileo.luna.evaluator import _extract_dict_text + + assert _extract_dict_text({"x": 7}, "x") == "7" + + +class TestContains: + """``_contains`` supports str/list/dict scores against a threshold.""" + + def test_none_threshold_is_no_match(self): + from agent_control_evaluator_galileo.luna.evaluator import _contains + + assert _contains("anything", None) is False + + def test_string_contains_substring(self): + from agent_control_evaluator_galileo.luna.evaluator import _contains + + assert _contains("hello world", "world") is True + assert _contains("hello world", "absent") is False + + def test_list_contains_value(self): + from agent_control_evaluator_galileo.luna.evaluator import _contains + + assert _contains(["a", "b", "c"], "b") is True + assert _contains(["a", "b", "c"], "z") is False + + def test_dict_threshold_matches_key(self): + from agent_control_evaluator_galileo.luna.evaluator import _contains + + assert _contains({"toxicity": 0.9}, "toxicity") is True + + def test_dict_threshold_matches_value(self): + from agent_control_evaluator_galileo.luna.evaluator import _contains + + assert _contains({"label": "flagged"}, "flagged") is True + + def test_other_types_return_false(self): + from agent_control_evaluator_galileo.luna.evaluator import _contains + + # Non-iterable score => no match. 
+ assert _contains(42, 42) is False + + +class TestConfidenceFromScore: + """``_confidence_from_score`` maps a raw score to [0, 1].""" + + def test_true_bool_maps_to_one(self): + from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score + + assert _confidence_from_score(True) == 1.0 + + def test_false_bool_maps_to_zero(self): + from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score + + assert _confidence_from_score(False) == 0.0 + + def test_in_range_number_returned_as_is(self): + from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score + + assert _confidence_from_score(0.42) == 0.42 + + def test_out_of_range_falls_back_to_one(self): + from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score + + # Above 1.0 → fall back to default confidence + assert _confidence_from_score(7.2) == 1.0 + + def test_non_numeric_falls_back_to_one(self): + from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score + + assert _confidence_from_score("not-a-number") == 1.0 + + +# ============================================================================= +# luna/evaluator.py: _score_matches operator branches +# ============================================================================= + + +@pytest.fixture +def luna_evaluator(monkeypatch): + """A ready-to-use LunaEvaluator instance with auth env wired up.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + return LunaEvaluator.from_dict( + {"scorer_label": "toxicity", "threshold": 0.5, "operator": "gte"} + ) + + +class TestScoreMatchesOperators: + """Every operator branch in ``_score_matches`` should evaluate.""" + + def _make(self, operator, threshold, monkeypatch): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + if operator in {"eq", "ne", "contains"}: + threshold_value 
= threshold + else: + threshold_value = threshold + return LunaEvaluator.from_dict( + {"scorer_label": "toxicity", "threshold": threshold_value, "operator": operator} + ) + + def test_any_truthy_score_matches(self, monkeypatch): + evaluator = self._make("any", 0.5, monkeypatch) + assert evaluator._score_matches(1) is True + assert evaluator._score_matches(0) is False + + def test_eq_matches_threshold(self, monkeypatch): + evaluator = self._make("eq", "flagged", monkeypatch) + assert evaluator._score_matches("flagged") is True + assert evaluator._score_matches("safe") is False + + def test_ne_matches_when_different(self, monkeypatch): + evaluator = self._make("ne", "flagged", monkeypatch) + assert evaluator._score_matches("safe") is True + assert evaluator._score_matches("flagged") is False + + def test_contains_matches_substring(self, monkeypatch): + evaluator = self._make("contains", "flag", monkeypatch) + assert evaluator._score_matches("flagged") is True + assert evaluator._score_matches("clean") is False + + def test_numeric_operators_all_branches(self, monkeypatch): + for op, expectations in [ + ("gt", [(0.9, True), (0.5, False)]), + ("gte", [(0.5, True), (0.4, False)]), + ("lt", [(0.4, True), (0.5, False)]), + ("lte", [(0.5, True), (0.6, False)]), + ]: + evaluator = self._make(op, 0.5, monkeypatch) + for score, expected in expectations: + assert evaluator._score_matches(score) is expected, (op, score) + + def test_numeric_operator_rejects_non_numeric_score(self, monkeypatch): + evaluator = self._make("gte", 0.5, monkeypatch) + with pytest.raises(ValueError, match="not numeric"): + evaluator._score_matches("not-a-number") + + +# ============================================================================= +# luna/evaluator.py: payload preparation + aclose +# ============================================================================= + + +class TestPreparePayload: + """``_prepare_payload`` routes scalar data based on the scorer label.""" + + def 
test_scalar_routed_to_input_when_label_lacks_output(self, monkeypatch): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5}) + + input_text, output_text = evaluator._prepare_payload("hello") + + assert input_text == "hello" + assert output_text is None + + def test_scalar_routed_to_output_when_label_contains_output(self, monkeypatch): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + evaluator = LunaEvaluator.from_dict( + {"scorer_label": "output_correctness", "threshold": 0.5} + ) + + input_text, output_text = evaluator._prepare_payload("hello") + + assert input_text is None + assert output_text == "hello" + + +@pytest.mark.asyncio +async def test_evaluator_aclose_closes_underlying_client(monkeypatch): + """``aclose`` must release the HTTP client when one was created.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5}) + + fake = MagicMock() + fake.close = AsyncMock() + evaluator._client = fake + + await evaluator.aclose() + + fake.close.assert_awaited_once() + assert evaluator._client is None + + +@pytest.mark.asyncio +async def test_evaluator_handles_non_success_status(monkeypatch): + """A non-success status from the scorer must surface as an error result.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + evaluator = LunaEvaluator.from_dict( + {"scorer_label": "toxicity", "threshold": 0.5, "operator": "gte"} + ) + + with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke: + 
mock_invoke.return_value = ScorerInvokeResponse( + scorer_label="toxicity", + score=None, + status="failed", + error_message="upstream timeout", + ) + + result = await evaluator.evaluate("hello") + + assert result.matched is False + assert result.error is not None + assert "upstream timeout" in result.error + + +# ============================================================================= +# luna/evaluator.py: package version fallback +# ============================================================================= + + +def test_resolve_package_version_falls_back_when_metadata_missing(): + """The dev fallback must trigger when the package isn't installed by metadata.""" + from importlib.metadata import PackageNotFoundError + + from agent_control_evaluator_galileo.luna import evaluator as evaluator_module + + with patch.object(evaluator_module, "version", side_effect=PackageNotFoundError): + result = evaluator_module._resolve_package_version() + + assert result == "0.0.0.dev" + + +# ============================================================================= +# luna/client.py: small helpers + branches +# ============================================================================= + + +class TestAsFloatOrNone: + """``_as_float_or_none`` parses scalar values; strings may fail.""" + + def test_returns_none_for_bool(self): + from agent_control_evaluator_galileo.luna.client import _as_float_or_none + + assert _as_float_or_none(True) is None + + def test_returns_none_for_none(self): + from agent_control_evaluator_galileo.luna.client import _as_float_or_none + + assert _as_float_or_none(None) is None + + def test_returns_float_for_int(self): + from agent_control_evaluator_galileo.luna.client import _as_float_or_none + + assert _as_float_or_none(7) == 7.0 + + def test_returns_float_for_string_number(self): + from agent_control_evaluator_galileo.luna.client import _as_float_or_none + + assert _as_float_or_none("0.42") == 0.42 + + def 
test_returns_none_for_unparseable_string(self): + from agent_control_evaluator_galileo.luna.client import _as_float_or_none + + assert _as_float_or_none("not-a-number") is None + + def test_returns_none_for_other_types(self): + from agent_control_evaluator_galileo.luna.client import _as_float_or_none + + assert _as_float_or_none([1, 2]) is None + + +class TestHasValue: + """``_has_value`` is the "is this scorable" predicate.""" + + def test_none_is_empty(self): + from agent_control_evaluator_galileo.luna.client import _has_value + + assert _has_value(None) is False + + def test_empty_string_is_empty(self): + from agent_control_evaluator_galileo.luna.client import _has_value + + assert _has_value("") is False + assert _has_value(" ") is False + + def test_non_empty_string_has_value(self): + from agent_control_evaluator_galileo.luna.client import _has_value + + assert _has_value("hi") is True + + def test_empty_list_or_dict_is_empty(self): + from agent_control_evaluator_galileo.luna.client import _has_value + + assert _has_value([]) is False + assert _has_value({}) is False + + def test_non_empty_list_or_dict_has_value(self): + from agent_control_evaluator_galileo.luna.client import _has_value + + assert _has_value([1]) is True + assert _has_value({"k": "v"}) is True + + def test_scalar_other_types_have_value(self): + from agent_control_evaluator_galileo.luna.client import _has_value + + assert _has_value(42) is True + assert _has_value(0) is True # 0 is a real value, not empty + assert _has_value(True) is True + + +class TestScorerInvokeRequestValidation: + """``ScorerInvokeRequest`` rejects malformed input combos.""" + + def test_missing_all_identifiers_raises(self): + from agent_control_evaluator_galileo.luna.client import ( + ScorerInvokeInputs, + ScorerInvokeRequest, + ) + from pydantic import ValidationError + + with pytest.raises(ValidationError, match="One of scorer_label"): + ScorerInvokeRequest(inputs=ScorerInvokeInputs(query="hello")) + + +def 
test_client_raises_when_no_credentials(monkeypatch): + """The client requires at least an API secret or an API key.""" + for name in ( + "GALILEO_API_SECRET_KEY", + "GALILEO_API_SECRET", + "GALILEO_API_KEY", + ): + monkeypatch.delenv(name, raising=False) + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY"): + GalileoLunaClient() + + +class TestDeriveApiUrl: + """URL derivation covers every console.* → api.* substitution branch.""" + + def _client(self, monkeypatch): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + return GalileoLunaClient() + + def test_console_dot_rewritten_to_api_dot(self, monkeypatch): + client = self._client(monkeypatch) + assert ( + client._derive_api_url("https://console.galileo.ai") + == "https://api.galileo.ai" + ) + + def test_console_dash_rewritten_to_api_dash(self, monkeypatch): + client = self._client(monkeypatch) + assert ( + client._derive_api_url("https://console-staging.galileo.ai") + == "https://api-staging.galileo.ai" + ) + + def test_plain_https_host_gets_api_prefix(self, monkeypatch): + client = self._client(monkeypatch) + assert ( + client._derive_api_url("https://example.com") + == "https://api.example.com" + ) + + def test_plain_http_host_gets_api_prefix(self, monkeypatch): + client = self._client(monkeypatch) + assert client._derive_api_url("http://example.com") == "http://api.example.com" + + def test_unknown_scheme_returned_as_is(self, monkeypatch): + client = self._client(monkeypatch) + # No console./console- prefix, no http(s) scheme → return unchanged. 
+ assert client._derive_api_url("api.example.com") == "api.example.com" + + +@pytest.mark.asyncio +async def test_get_client_adds_api_key_header_when_no_secret(monkeypatch): + """When only an API key is configured, the public-API header is set.""" + monkeypatch.delenv("GALILEO_API_SECRET_KEY", raising=False) + monkeypatch.delenv("GALILEO_API_SECRET", raising=False) + monkeypatch.setenv("GALILEO_API_KEY", "public-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + http_client = await client._get_client() + try: + assert http_client.headers.get("Galileo-API-Key") == "public-key" + finally: + await client.close() + + +@pytest.mark.asyncio +async def test_invoke_rejects_missing_scorer_identifier(monkeypatch): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + try: + with pytest.raises(ValueError, match="At least one scorer identifier"): + await client.invoke(input="hello") + finally: + await client.close() + + +@pytest.mark.asyncio +async def test_invoke_raises_when_response_is_not_a_json_object(monkeypatch): + """A non-object JSON body must surface as a clear RuntimeError.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + + fake_response = MagicMock() + fake_response.raise_for_status = MagicMock() + fake_response.json = MagicMock(return_value=["not", "an", "object"]) + + fake_http = AsyncMock() + fake_http.post = AsyncMock(return_value=fake_response) + fake_http.is_closed = False + client._client = fake_http + + try: + with pytest.raises(RuntimeError, match="not a JSON object"): + await client.invoke(scorer_label="toxicity", input="hello") + finally: + await client.close() + + +@pytest.mark.asyncio +async def test_invoke_propagates_http_status_error(monkeypatch): + """The client 
logs and re-raises HTTP status errors.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + + fake_response = MagicMock(spec=httpx.Response) + fake_response.status_code = 500 + fake_response.text = "internal error" + fake_response.raise_for_status = MagicMock( + side_effect=httpx.HTTPStatusError( + "boom", request=MagicMock(spec=httpx.Request), response=fake_response + ) + ) + + fake_http = AsyncMock() + fake_http.post = AsyncMock(return_value=fake_response) + fake_http.is_closed = False + client._client = fake_http + + try: + with pytest.raises(httpx.HTTPStatusError): + await client.invoke(scorer_label="toxicity", input="hello") + finally: + await client.close() + + +@pytest.mark.asyncio +async def test_invoke_propagates_request_error(monkeypatch): + """RequestError is logged and re-raised so callers can decide policy.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + + fake_http = AsyncMock() + fake_http.post = AsyncMock(side_effect=httpx.RequestError("network down")) + fake_http.is_closed = False + client._client = fake_http + + try: + with pytest.raises(httpx.RequestError): + await client.invoke(scorer_label="toxicity", input="hello") + finally: + await client.close() + + +@pytest.mark.asyncio +async def test_client_async_context_manager_closes_on_exit(monkeypatch): + """Entering/exiting the async context manager must close the client.""" + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + async with GalileoLunaClient() as client: + # Trigger lazy client creation so close() has work to do. + await client._get_client() + assert client._client is not None + + # __aexit__ closes the underlying httpx client. 
+ assert client._client is None diff --git a/sdks/python/tests/test_evaluators_optional_imports.py b/sdks/python/tests/test_evaluators_optional_imports.py new file mode 100644 index 00000000..a2842a1b --- /dev/null +++ b/sdks/python/tests/test_evaluators_optional_imports.py @@ -0,0 +1,93 @@ +"""Coverage for the optional galileo import fallbacks in agent_control.evaluators. + +The galileo extras are normally installed in the dev environment, so the +``except ImportError`` branches in ``agent_control/evaluators/__init__.py`` +never fire under regular tests. This module forces those failures by hiding +the relevant modules in ``sys.modules`` and reloading the package. +""" + +from __future__ import annotations + +import builtins +import importlib +import sys + + +def _reload_evaluators_with_blocked(prefix: str) -> object: + """Reload ``agent_control.evaluators`` while ``prefix.*`` imports fail. + + Returns the freshly loaded module so callers can inspect ``__all__``. + Restores the original ``builtins.__import__`` and ``sys.modules`` entries + on the way out. + """ + original_import = builtins.__import__ + + def fail_for_prefix(name: str, *args: object, **kwargs: object) -> object: + if name == prefix or name.startswith(f"{prefix}."): + raise ImportError(f"forced failure for {name}") + return original_import(name, *args, **kwargs) # type: ignore[arg-type] + + # Drop any cached entries so the patched import is consulted. + blocked_modules = [m for m in list(sys.modules) if m == prefix or m.startswith(f"{prefix}.")] + saved_modules = {m: sys.modules.pop(m) for m in blocked_modules} + saved_evaluators = sys.modules.pop("agent_control.evaluators", None) + + builtins.__import__ = fail_for_prefix + try: + import agent_control.evaluators as reloaded + + reloaded = importlib.reload(reloaded) + return reloaded + finally: + builtins.__import__ = original_import + # Restore the cached modules so other tests keep their state. 
+ for name, module in saved_modules.items(): + sys.modules[name] = module + if saved_evaluators is not None: + sys.modules["agent_control.evaluators"] = saved_evaluators + + +def test_module_loads_when_galileo_luna_is_unavailable(): + """Hiding ``agent_control_evaluator_galileo.luna`` exercises its except branch.""" + reloaded = _reload_evaluators_with_blocked("agent_control_evaluator_galileo.luna") + + # Core names are always present. + assert "Evaluator" in reloaded.__all__ + # Luna1 names are NOT present because the import failed. + assert "LunaEvaluator" not in reloaded.__all__ + assert "GalileoLunaClient" not in reloaded.__all__ + + +def test_module_loads_when_galileo_package_is_unavailable(): + """Hiding the whole package exercises both ImportError fallbacks at once.""" + reloaded = _reload_evaluators_with_blocked("agent_control_evaluator_galileo") + + assert "Evaluator" in reloaded.__all__ + # Both luna1 and luna2 optional names are absent. + for absent in ( + "LunaEvaluator", + "GalileoLunaClient", + "Luna2Evaluator", + "Luna2EvaluatorConfig", + "LUNA_AVAILABLE", + "LUNA2_AVAILABLE", + ): + assert absent not in reloaded.__all__ + + +def test_module_loads_galileo_optional_imports_when_available(): + """Sanity check: with galileo installed, the optional names ARE exposed. + + Reloading without patching __import__ runs both success branches. + """ + saved = sys.modules.pop("agent_control.evaluators", None) + try: + import agent_control.evaluators as reloaded + + reloaded = importlib.reload(reloaded) + # Sanity: at least one luna1 and one luna2 name should reappear. 
+ assert "LunaEvaluator" in reloaded.__all__ + assert "Luna2Evaluator" in reloaded.__all__ + finally: + if saved is not None: + sys.modules["agent_control.evaluators"] = saved From 9a94bba0eb4500f3e816e86008446289c26fb1d1 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Sat, 16 May 2026 06:41:50 -0700 Subject: [PATCH 16/20] move coervagera --- .../python/tests/test_evaluators_optional_imports.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sdks/python/tests/test_evaluators_optional_imports.py b/sdks/python/tests/test_evaluators_optional_imports.py index a2842a1b..2bc2129e 100644 --- a/sdks/python/tests/test_evaluators_optional_imports.py +++ b/sdks/python/tests/test_evaluators_optional_imports.py @@ -10,8 +10,16 @@ import builtins import importlib +import importlib.util import sys +import pytest + +_GALILEO_INSTALLED = ( + importlib.util.find_spec("agent_control_evaluator_galileo.luna") is not None + and importlib.util.find_spec("agent_control_evaluator_galileo.luna2") is not None +) + def _reload_evaluators_with_blocked(prefix: str) -> object: """Reload ``agent_control.evaluators`` while ``prefix.*`` imports fail. @@ -75,6 +83,10 @@ def test_module_loads_when_galileo_package_is_unavailable(): assert absent not in reloaded.__all__ +@pytest.mark.skipif( + not _GALILEO_INSTALLED, + reason="agent-control-evaluator-galileo extras not installed in this environment", +) def test_module_loads_galileo_optional_imports_when_available(): """Sanity check: with galileo installed, the optional names ARE exposed. 
From f0d11b7fede964cab123728788ebb7ea4fc07fa7 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Sat, 16 May 2026 06:45:41 -0700 Subject: [PATCH 17/20] failing test --- .../tests/test_evaluators_optional_imports.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sdks/python/tests/test_evaluators_optional_imports.py b/sdks/python/tests/test_evaluators_optional_imports.py index 2bc2129e..735164be 100644 --- a/sdks/python/tests/test_evaluators_optional_imports.py +++ b/sdks/python/tests/test_evaluators_optional_imports.py @@ -15,10 +15,21 @@ import pytest -_GALILEO_INSTALLED = ( - importlib.util.find_spec("agent_control_evaluator_galileo.luna") is not None - and importlib.util.find_spec("agent_control_evaluator_galileo.luna2") is not None -) + +def _module_available(name: str) -> bool: + """Return whether ``name`` resolves without raising for missing parents.""" + try: + return importlib.util.find_spec(name) is not None + except (ImportError, ValueError): + # ``find_spec`` raises ModuleNotFoundError (a subclass of ImportError) + # when a *parent* package is missing, instead of returning None. Treat + # that as "not installed." 
+ return False + + +_GALILEO_INSTALLED = _module_available( + "agent_control_evaluator_galileo.luna" +) and _module_available("agent_control_evaluator_galileo.luna2") def _reload_evaluators_with_blocked(prefix: str) -> object: From f683dda765e3623d2fbac0a6da81d9aaf3157296 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Mon, 18 May 2026 10:26:50 -0700 Subject: [PATCH 18/20] add input text that goes into controls evaluators --- engine/src/agent_control_engine/core.py | 1 + engine/tests/test_core.py | 42 ++++++++++++++++++++++ server/src/agent_control_server/migrate.py | 2 ++ server/tests/test_migrate.py | 5 +++ 4 files changed, 50 insertions(+) diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index 99c2273b..fdb323e0 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -224,6 +224,7 @@ async def _evaluate_leaf( "message": self._truncated_message(result.message), } metadata = dict(result.metadata or {}) + metadata["selected_data"] = data metadata["condition_trace"] = trace return _ConditionEvaluation( result=result.model_copy(update={"metadata": metadata}), diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py index 78eda0ab..d2eb0871 100644 --- a/engine/tests/test_core.py +++ b/engine/tests/test_core.py @@ -1412,6 +1412,48 @@ async def test_or_short_circuit_records_skipped_trace(self): assert trace["children"][1]["matched"] is None assert trace["children"][1]["short_circuit_reason"] == "or_matched" + @pytest.mark.asyncio + async def test_leaf_metadata_includes_selector_selected_data(self): + """Leaf metadata should expose the value selected by selector.path.""" + # Given: a leaf control selecting a nested step input value + controls = [ + MockControlWithIdentity( + id=1, + name="city_control", + control=ControlDefinition( + description="City guardrail", + enabled=True, + execution="server", + scope={"step_types": ["tool"], "stages": ["pre"]}, + condition={ 
+ "selector": {"path": "input.city"}, + "evaluator": {"name": "test-deny", "config": {"value": "match"}}, + }, + action={"decision": "observe"}, + ), + ) + ] + engine = ControlEngine(controls) + + # When: processing a request where input.city has a concrete value + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step( + type="tool", + name="lookup-weather", + input={"city": "San Francisco"}, + output=None, + ), + stage="pre", + ) + result = await engine.process(request) + + # Then: event reconstruction can use selected_data as ControlSpan.input + assert result.matches is not None + metadata = result.matches[0].result.metadata + assert metadata is not None + assert metadata["selected_data"] == "San Francisco" + @pytest.mark.asyncio async def test_composite_results_preserve_decisive_child_metadata(self): """Composite results should retain structured metadata from the decisive child.""" diff --git a/server/src/agent_control_server/migrate.py b/server/src/agent_control_server/migrate.py index 3f260d4a..48c775e0 100644 --- a/server/src/agent_control_server/migrate.py +++ b/server/src/agent_control_server/migrate.py @@ -110,6 +110,7 @@ def _acquire_migration_lock(connection: Connection, timeout_seconds: float) -> N ).scalar_one() ) if acquired: + connection.commit() LOGGER.info("Acquired Agent Control migration advisory lock.") return @@ -150,6 +151,7 @@ def _serialized_migration(cfg: Config, *, enabled: bool) -> Iterator[None]: _MIGRATION_LOCK_PARAMS, ).scalar_one() ) + connection.commit() if released: LOGGER.info("Released Agent Control migration advisory lock.") else: diff --git a/server/tests/test_migrate.py b/server/tests/test_migrate.py index eaed9798..a82e6dd4 100644 --- a/server/tests/test_migrate.py +++ b/server/tests/test_migrate.py @@ -19,6 +19,7 @@ class _FakeConnection: def __init__(self, lock_results: list[bool]) -> None: self.lock_results = lock_results self.statements: list[str] = [] + self.commits = 0 def 
__enter__(self) -> _FakeConnection: return self @@ -35,6 +36,9 @@ def execute(self, statement: object, params: object) -> _FakeResult: return _FakeResult(True) raise AssertionError(f"unexpected SQL statement: {statement_text}") + def commit(self) -> None: + self.commits += 1 + class _FakeEngine: def __init__(self, connection: _FakeConnection) -> None: @@ -106,6 +110,7 @@ def test_serialized_migration_acquires_and_releases_postgres_lock(monkeypatch) - "SELECT pg_try_advisory_lock(:class_id, :object_id)", "SELECT pg_advisory_unlock(:class_id, :object_id)", ] + assert connection.commits == 2 assert sleeps == [2.0] assert engine.disposed From 7210fc10872902083b93ce1e0c340eaf52e580e9 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Mon, 18 May 2026 10:59:05 -0700 Subject: [PATCH 19/20] add docstring --- engine/src/agent_control_engine/core.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index fdb323e0..2b2404a9 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -270,7 +270,19 @@ def _composite_metadata( *, matched: bool, ) -> dict[str, Any] | None: - """Select stable child metadata to preserve on composite results.""" + """Select stable child metadata to preserve on composite results. + + The selected_data value in this metadata is not all evaluator inputs. + It is the selected value from the leaf metadata the engine preserves for + the final composite result: + - or where one child matches: selected_data comes from the matching child. + - and where one child fails: selected_data comes from the failing child. + - and where all children match: selected_data comes from the first + matching child, usually the first leaf. + - or where no children match: selected_data comes from the first + evaluated child. + - not: selected_data comes from its child. 
+ """ source_result: EvaluatorResult | None = None if matched: source_result = next( From f513dac2d90a70f2ba653cd267ace4947f40278b Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Fri, 22 May 2026 12:19:00 -0700 Subject: [PATCH 20/20] address comments --- engine/src/agent_control_engine/core.py | 135 ++++++++++++++++-- engine/tests/test_core.py | 115 ++++++++++++++- .../luna/client.py | 76 ++++++++-- .../luna/config.py | 9 ++ .../luna/evaluator.py | 16 +-- .../galileo/tests/test_luna_coverage_gaps.py | 122 ++++++++++++++-- .../galileo/tests/test_luna_evaluator.py | 3 + examples/galileo_luna/README.md | 23 ++- examples/galileo_luna/demo_agent.py | 30 +++- examples/galileo_luna/setup_controls.py | 37 +++-- .../src/agent_control/evaluation_events.py | 15 +- sdks/python/src/agent_control/otel_sink.py | 10 ++ .../tests/test_observability_updates.py | 72 +++++++++- sdks/python/tests/test_otel_sink.py | 22 ++- 14 files changed, 616 insertions(+), 69 deletions(-) diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index 2b2404a9..e2ae8b6e 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -33,6 +33,108 @@ # Max concurrent evaluations (limits task spawning overhead for large policies) MAX_CONCURRENT_EVALUATIONS = int(os.environ.get("MAX_CONCURRENT_EVALUATIONS", "3")) +SELECTED_DATA_PREVIEW_MAX_CHARS = int( + os.environ.get("AGENT_CONTROL_SELECTED_DATA_PREVIEW_MAX_CHARS", "500") +) +SELECTED_DATA_PREVIEW_MAX_ITEMS = int( + os.environ.get("AGENT_CONTROL_SELECTED_DATA_PREVIEW_MAX_ITEMS", "20") +) +SELECTED_DATA_PREVIEW_MAX_DEPTH = int( + os.environ.get("AGENT_CONTROL_SELECTED_DATA_PREVIEW_MAX_DEPTH", "3") +) +_SENSITIVE_KEY_PARTS = ( + "api_key", + "apikey", + "authorization", + "credential", + "password", + "secret", + "token", +) + + +def _env_flag(name: str, *, default: bool = False) -> bool: + """Read a boolean environment flag.""" + value = os.environ.get(name) + if value is 
None: + return default + return value.strip().lower() in {"1", "true", "yes", "on"} + + +def _is_sensitive_key(key: object) -> bool: + """Return whether a mapping key is likely to contain a secret.""" + normalized = str(key).lower() + return any(part in normalized for part in _SENSITIVE_KEY_PARTS) + + +def _truncate_string(value: str, max_chars: int) -> tuple[str, bool]: + """Return a bounded string preview and whether it was truncated.""" + if len(value) <= max_chars: + return value, False + if max_chars <= 3: + return value[:max_chars], True + return f"{value[: max_chars - 3]}...", True + + +def _selected_data_preview_value( + value: Any, + *, + depth: int = 0, +) -> tuple[Any, bool]: + """Build a bounded, redacted preview of selected data.""" + if depth >= SELECTED_DATA_PREVIEW_MAX_DEPTH: + return "", True + + if value is None or isinstance(value, bool | int | float): + return value, False + + if isinstance(value, str): + return _truncate_string(value, SELECTED_DATA_PREVIEW_MAX_CHARS) + + if isinstance(value, dict): + preview: dict[str, Any] = {} + truncated = len(value) > SELECTED_DATA_PREVIEW_MAX_ITEMS + for index, (key, item) in enumerate(value.items()): + if index >= SELECTED_DATA_PREVIEW_MAX_ITEMS: + break + preview_key = str(key) + if _is_sensitive_key(key): + preview[preview_key] = "" + truncated = True + continue + preview_item, item_truncated = _selected_data_preview_value( + item, + depth=depth + 1, + ) + preview[preview_key] = preview_item + truncated = truncated or item_truncated + return preview, truncated + + if isinstance(value, list | tuple): + preview_items: list[Any] = [] + truncated = len(value) > SELECTED_DATA_PREVIEW_MAX_ITEMS + for item in value[:SELECTED_DATA_PREVIEW_MAX_ITEMS]: + preview_item, item_truncated = _selected_data_preview_value( + item, + depth=depth + 1, + ) + preview_items.append(preview_item) + truncated = truncated or item_truncated + return preview_items, truncated + + text_preview, truncated = _truncate_string(str(value), 
SELECTED_DATA_PREVIEW_MAX_CHARS) + return text_preview, truncated + + +def _selected_data_preview(value: Any) -> dict[str, Any]: + """Return UI-safe selector output details for evaluator-level inspection.""" + preview, truncated = _selected_data_preview_value(value) + return { + "type": type(value).__name__, + "value": preview, + "truncated": truncated, + } + @functools.lru_cache(maxsize=256) def _compile_regex(pattern: str) -> Any: @@ -102,9 +204,16 @@ def __init__( self, controls: Sequence[ControlWithIdentity], context: Literal["sdk", "server"] = "server", + *, + include_raw_selected_data: bool | None = None, ): self.controls = controls self.context = context + self.include_raw_selected_data = ( + _env_flag("AGENT_CONTROL_INCLUDE_RAW_SELECTED_DATA") + if include_raw_selected_data is None + else include_raw_selected_data + ) @staticmethod def _truncated_message(message: str | None) -> str | None: @@ -224,7 +333,9 @@ async def _evaluate_leaf( "message": self._truncated_message(result.message), } metadata = dict(result.metadata or {}) - metadata["selected_data"] = data + if self.include_raw_selected_data: + metadata["engine_selected_data"] = data + metadata["engine_selected_data_preview"] = _selected_data_preview(data) metadata["condition_trace"] = trace return _ConditionEvaluation( result=result.model_copy(update={"metadata": metadata}), @@ -272,16 +383,18 @@ def _composite_metadata( ) -> dict[str, Any] | None: """Select stable child metadata to preserve on composite results. - The selected_data value in this metadata is not all evaluator inputs. - It is the selected value from the leaf metadata the engine preserves for - the final composite result: - - or where one child matches: selected_data comes from the matching child. - - and where one child fails: selected_data comes from the failing child. - - and where all children match: selected_data comes from the first - matching child, usually the first leaf. 
- - or where no children match: selected_data comes from the first - evaluated child. - - not: selected_data comes from its child. + The engine_selected_data_preview value in this metadata is not all + evaluator inputs. It is the bounded selected value preview from the leaf + metadata the engine preserves for the final composite result: + - or where one child matches: engine_selected_data_preview comes from the + matching child. + - and where one child fails: engine_selected_data_preview comes from the + failing child. + - and where all children match: engine_selected_data_preview comes from the + first matching child, usually the first leaf. + - or where no children match: engine_selected_data_preview comes from the + first evaluated child. + - not: engine_selected_data_preview comes from its child. """ source_result: EvaluatorResult | None = None if matched: diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py index d2eb0871..ed4e6e00 100644 --- a/engine/tests/test_core.py +++ b/engine/tests/test_core.py @@ -157,7 +157,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: matched=matched, confidence=0.8 if matched else 0.4, message=f"Metadata {self.config.value}", - metadata={"source": self.config.value, "selected_data": data}, + metadata={"source": self.config.value, "selected_data": f"evaluator:{data}"}, ) _execution_log.append(f"metadata:{self.config.value}:end") return result @@ -1413,8 +1413,8 @@ async def test_or_short_circuit_records_skipped_trace(self): assert trace["children"][1]["short_circuit_reason"] == "or_matched" @pytest.mark.asyncio - async def test_leaf_metadata_includes_selector_selected_data(self): - """Leaf metadata should expose the value selected by selector.path.""" + async def test_leaf_metadata_includes_selector_selected_data_preview(self): + """Leaf metadata should expose a safe preview of the selected selector.path value.""" # Given: a leaf control selecting a nested step input value controls = [ 
MockControlWithIdentity( @@ -1448,11 +1448,109 @@ async def test_leaf_metadata_includes_selector_selected_data(self): ) result = await engine.process(request) - # Then: event reconstruction can use selected_data as ControlSpan.input + # Then: UI consumers can inspect the selected value without raw data export. assert result.matches is not None metadata = result.matches[0].result.metadata assert metadata is not None - assert metadata["selected_data"] == "San Francisco" + assert "selected_data" not in metadata + assert metadata["engine_selected_data_preview"] == { + "type": "str", + "value": "San Francisco", + "truncated": False, + } + + @pytest.mark.asyncio + async def test_leaf_selected_data_preview_is_bounded_and_redacted(self): + """Selected data previews should cap payload size and redact secret-like keys.""" + # Given: a leaf control selecting a large object with a secret-like key + controls = [ + MockControlWithIdentity( + id=1, + name="payload_control", + control=ControlDefinition( + description="Payload guardrail", + enabled=True, + execution="server", + scope={"step_types": ["tool"], "stages": ["pre"]}, + condition={ + "selector": {"path": "input"}, + "evaluator": {"name": "test-deny", "config": {"value": "match"}}, + }, + action={"decision": "observe"}, + ), + ) + ] + engine = ControlEngine(controls) + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step( + type="tool", + name="send-payload", + input={ + "prompt": "x" * 600, + "api_key": "secret-value", + }, + output=None, + ), + stage="pre", + ) + + # When: processing the request + result = await engine.process(request) + + # Then: the preview is useful for UI inspection but does not expose the raw payload. 
+ assert result.matches is not None + metadata = result.matches[0].result.metadata + assert metadata is not None + preview = metadata["engine_selected_data_preview"] + assert preview["type"] == "dict" + assert preview["truncated"] is True + assert preview["value"]["api_key"] == "" + assert preview["value"]["prompt"].endswith("...") + assert len(preview["value"]["prompt"]) == 500 + + @pytest.mark.asyncio + async def test_engine_selected_data_does_not_overwrite_evaluator_metadata(self): + """Engine-owned selector data should not collide with evaluator-owned metadata.""" + # Given: an evaluator that deliberately returns its own selected_data key + controls = [ + MockControlWithIdentity( + id=1, + name="metadata_control", + control=ControlDefinition( + description="Metadata guardrail", + enabled=True, + execution="server", + scope={"step_types": ["llm"], "stages": ["pre"]}, + condition={ + "selector": {"path": "input"}, + "evaluator": {"name": "test-metadata", "config": {"value": "match"}}, + }, + action={"decision": "observe"}, + ), + ) + ] + engine = ControlEngine(controls, include_raw_selected_data=True) + request = EvaluationRequest( + agent_name="00000000-0000-0000-0000-000000000001", + step=Step(type="llm", name="test-step", input="raw input", output=None), + stage="pre", + ) + + # When: processing the request + result = await engine.process(request) + + # Then: evaluator-owned metadata remains intact and engine-owned data is namespaced. 
+ assert result.matches is not None + metadata = result.matches[0].result.metadata + assert metadata is not None + assert metadata["selected_data"] == "evaluator:raw input" + assert metadata["engine_selected_data"] == "raw input" + assert metadata["engine_selected_data_preview"] == { + "type": "str", + "value": "raw input", + "truncated": False, + } @pytest.mark.asyncio async def test_composite_results_preserve_decisive_child_metadata(self): @@ -1511,7 +1609,12 @@ async def test_composite_results_preserve_decisive_child_metadata(self): metadata = result.matches[0].result.metadata assert metadata is not None assert metadata["source"] == "match-right" - assert metadata["selected_data"] == "chosen" + assert metadata["selected_data"] == "evaluator:chosen" + assert metadata["engine_selected_data_preview"] == { + "type": "str", + "value": "chosen", + "truncated": False, + } assert metadata["condition_trace"]["type"] == "or" assert "slow:skip-tail:start" not in _execution_log diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py index 51d34c96..3bbc807f 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py @@ -9,6 +9,7 @@ from hmac import new as hmac_new from json import dumps from time import time +from typing import Literal import httpx from agent_control_models import JSONObject, JSONValue @@ -20,6 +21,7 @@ DEFAULT_INTERNAL_TOKEN_TTL_SECS = 3600 PUBLIC_SCORER_INVOKE_PATH = "/scorers/invoke" INTERNAL_SCORER_INVOKE_PATH = "/internal/scorers/invoke" +AuthMode = Literal["public", "internal"] def _b64url(data: bytes) -> str: @@ -49,6 +51,18 @@ def _internal_auth_token( return f"{signing_input}.{_b64url(signature)}" +def _env_auth_mode() -> AuthMode | None: + value = os.getenv("GALILEO_LUNA_AUTH_MODE") + if value is None or value.strip() == 
"": + return None + normalized = value.strip().lower() + if normalized == "public": + return "public" + if normalized == "internal": + return "internal" + raise ValueError("GALILEO_LUNA_AUTH_MODE must be either 'public' or 'internal'.") + + def _as_float_or_none(value: JSONValue) -> float | None: if isinstance(value, bool) or value is None: return None @@ -151,6 +165,7 @@ class GalileoLunaClient: Environment Variables: GALILEO_API_SECRET_KEY or GALILEO_API_SECRET: Galileo API internal JWT signing secret. GALILEO_API_KEY: Galileo API key fallback for public scorer invocation. + GALILEO_LUNA_AUTH_MODE: Auth mode, either "public" or "internal". GALILEO_CONSOLE_URL: Galileo Console URL (optional, defaults to production). """ @@ -160,6 +175,7 @@ def __init__( api_secret: str | None = None, console_url: str | None = None, api_url: str | None = None, + auth_mode: AuthMode | None = None, ) -> None: """Initialize the Galileo Luna client. @@ -171,22 +187,26 @@ def __init__( GALILEO_CONSOLE_URL or uses the production console URL. api_url: Galileo API URL. If not provided, reads from GALILEO_API_URL before deriving from the console URL. + auth_mode: Auth mode to use. If not provided, reads from + GALILEO_LUNA_AUTH_MODE, or infers from the single available credential. Raises: - ValueError: If neither API secret nor API key is provided. + ValueError: If credentials are missing, ambiguous, or incompatible with + the selected auth mode. """ resolved_api_secret = ( api_secret or os.getenv("GALILEO_API_SECRET_KEY") or os.getenv("GALILEO_API_SECRET") ) resolved_api_key = api_key or os.getenv("GALILEO_API_KEY") - if not resolved_api_secret and not resolved_api_key: - raise ValueError( - "GALILEO_API_SECRET_KEY or GALILEO_API_KEY is required. " - "Set one as an environment variable or pass it to the constructor." 
- ) + resolved_auth_mode = self._resolve_auth_mode( + auth_mode or _env_auth_mode(), + api_key=resolved_api_key, + api_secret=resolved_api_secret, + ) self.api_key = resolved_api_key self.api_secret = resolved_api_secret + self.auth_mode = resolved_auth_mode self.console_url = ( console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai" ) @@ -194,6 +214,44 @@ def __init__( "/" ) or self._derive_api_url(self.console_url) self._client: httpx.AsyncClient | None = None + logger.info("[GalileoLunaClient] Auth mode selected: %s", self.auth_mode) + + @staticmethod + def _resolve_auth_mode( + auth_mode: AuthMode | None, + *, + api_key: str | None, + api_secret: str | None, + ) -> AuthMode: + if auth_mode == "public": + if not api_key: + raise ValueError( + "GALILEO_API_KEY is required when GALILEO_LUNA_AUTH_MODE=public." + ) + return "public" + + if auth_mode == "internal": + if not api_secret: + raise ValueError( + "GALILEO_API_SECRET_KEY or GALILEO_API_SECRET is required when " + "GALILEO_LUNA_AUTH_MODE=internal." + ) + return "internal" + + if api_key and api_secret: + raise ValueError( + "Both Galileo API key and API secret are configured. Set " + "GALILEO_LUNA_AUTH_MODE to 'public' or 'internal' to choose the " + "runtime auth mode explicitly." + ) + if api_secret: + return "internal" + if api_key: + return "public" + raise ValueError( + "GALILEO_API_SECRET_KEY or GALILEO_API_KEY is required. " + "Set one as an environment variable or pass it to the constructor." 
+ ) def _derive_api_url(self, console_url: str) -> str: """Derive the API URL from a Galileo Console URL.""" @@ -215,7 +273,7 @@ async def _get_client(self) -> httpx.AsyncClient: """Get or create the HTTP client.""" if self._client is None or self._client.is_closed: headers = {"Content-Type": "application/json"} - if self.api_secret is None and self.api_key is not None: + if self.auth_mode == "public" and self.api_key is not None: headers["Galileo-API-Key"] = self.api_key self._client = httpx.AsyncClient( headers=headers, @@ -228,9 +286,11 @@ def _endpoint_and_headers( headers: dict[str, str] | None, ) -> tuple[str, dict[str, str]]: request_headers = dict(headers or {}) - if self.api_secret is None: + if self.auth_mode == "public": return f"{self.api_base}{PUBLIC_SCORER_INVOKE_PATH}", request_headers + if self.api_secret is None: + raise RuntimeError("Internal Luna auth mode is missing an API secret.") request_headers["Authorization"] = f"Bearer {_internal_auth_token(self.api_secret)}" return f"{self.api_base}{INTERNAL_SCORER_INVOKE_PATH}", request_headers diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py index c49dd716..788fa24c 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py @@ -9,6 +9,7 @@ from pydantic import Field, model_validator LunaOperator = Literal["gt", "gte", "lt", "lte", "eq", "ne", "contains", "any"] +LunaPayloadField = Literal["input", "output"] _NUMERIC_OPERATORS = frozenset({"gt", "gte", "lt", "lte"}) @@ -37,6 +38,7 @@ class LunaEvaluatorConfig(EvaluatorConfig): threshold: Local threshold used by the evaluator for comparison. operator: Local comparison operator. Numeric operators use threshold as a number. scorer_config: Optional scorer-specific config sent as ``config``. 
+ payload_field: Explicit scorer input side for scalar selected data. timeout_ms: Request timeout in milliseconds. """ @@ -69,6 +71,13 @@ class LunaEvaluatorConfig(EvaluatorConfig): serialization_alias="config", description="Optional scorer-specific configuration sent to Galileo.", ) + payload_field: LunaPayloadField = Field( + default="input", + description=( + "Which scorer input side to use when selector output is a scalar value. " + "Structured selected data with input/output keys overrides this setting." + ), + ) timeout_ms: int = Field( default=10000, ge=1000, diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py index ce46cf44..7b48052f 100644 --- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py +++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py @@ -61,8 +61,6 @@ def _contains(score: JSONValue, threshold: JSONValue) -> bool: if isinstance(score, list): return threshold in score if isinstance(score, dict): - if isinstance(threshold, str) and threshold in score: - return True return threshold in score.values() return False @@ -116,12 +114,10 @@ def __init__(self, config: LunaEvaluatorConfig) -> None: ) super().__init__(config) - self._client: GalileoLunaClient | None = None + self._client = GalileoLunaClient() def _get_client(self) -> GalileoLunaClient: - """Get or create the Galileo Luna client.""" - if self._client is None: - self._client = GalileoLunaClient() + """Get the Galileo Luna client.""" return self._client def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]: @@ -133,8 +129,7 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]: return input_text, output_text text = _coerce_payload_text(data) - scorer_label = self.config.scorer_label or "" - if "output" in scorer_label: + if self.config.payload_field == "output": return 
None, text return text, None @@ -262,7 +257,6 @@ def _handle_error( confidence=0.0, message=f"Luna evaluation error: {error_detail}", metadata={ - "error": error_detail, "error_type": type(error).__name__, "scorer_label": self.config.scorer_label, "scorer_id": self.config.scorer_id, @@ -273,6 +267,4 @@ def _handle_error( async def aclose(self) -> None: """Close the underlying Galileo Luna client.""" - if self._client is not None: - await self._client.close() - self._client = None + await self._client.close() diff --git a/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py index f4d0e360..68755c99 100644 --- a/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py +++ b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py @@ -76,7 +76,7 @@ def test_present_key_coerced(self): class TestContains: - """``_contains`` supports str/list/dict scores against a threshold.""" + """``_contains`` supports str/list and dict values against a threshold.""" def test_none_threshold_is_no_match(self): from agent_control_evaluator_galileo.luna.evaluator import _contains @@ -95,10 +95,10 @@ def test_list_contains_value(self): assert _contains(["a", "b", "c"], "b") is True assert _contains(["a", "b", "c"], "z") is False - def test_dict_threshold_matches_key(self): + def test_dict_threshold_does_not_match_key(self): from agent_control_evaluator_galileo.luna.evaluator import _contains - assert _contains({"toxicity": 0.9}, "toxicity") is True + assert _contains({"toxicity": 0.9}, "toxicity") is False def test_dict_threshold_matches_value(self): from agent_control_evaluator_galileo.luna.evaluator import _contains @@ -216,7 +216,7 @@ def test_numeric_operator_rejects_non_numeric_score(self, monkeypatch): class TestPreparePayload: - """``_prepare_payload`` routes scalar data based on the scorer label.""" + """``_prepare_payload`` routes scalar data using explicit config.""" def 
test_scalar_routed_to_input_when_label_lacks_output(self, monkeypatch): monkeypatch.setenv("GALILEO_API_KEY", "test-key") @@ -229,12 +229,16 @@ def test_scalar_routed_to_input_when_label_lacks_output(self, monkeypatch): assert input_text == "hello" assert output_text is None - def test_scalar_routed_to_output_when_label_contains_output(self, monkeypatch): + def test_scalar_routed_to_output_when_payload_field_is_output(self, monkeypatch): monkeypatch.setenv("GALILEO_API_KEY", "test-key") from agent_control_evaluator_galileo.luna import LunaEvaluator evaluator = LunaEvaluator.from_dict( - {"scorer_label": "output_correctness", "threshold": 0.5} + { + "scorer_label": "toxicity", + "threshold": 0.5, + "payload_field": "output", + } ) input_text, output_text = evaluator._prepare_payload("hello") @@ -242,10 +246,45 @@ def test_scalar_routed_to_output_when_label_contains_output(self, monkeypatch): assert input_text is None assert output_text == "hello" + def test_scalar_output_label_without_payload_field_still_defaults_to_input( + self, + monkeypatch, + ): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + evaluator = LunaEvaluator.from_dict( + {"scorer_label": "output_correctness", "threshold": 0.5} + ) + + input_text, output_text = evaluator._prepare_payload("hello") + + assert input_text == "hello" + assert output_text is None + + def test_structured_payload_uses_input_output_keys_over_payload_field(self, monkeypatch): + monkeypatch.setenv("GALILEO_API_KEY", "test-key") + from agent_control_evaluator_galileo.luna import LunaEvaluator + + evaluator = LunaEvaluator.from_dict( + { + "scorer_label": "toxicity", + "threshold": 0.5, + "payload_field": "output", + } + ) + + input_text, output_text = evaluator._prepare_payload( + {"input": "prompt", "output": "answer"} + ) + + assert input_text == "prompt" + assert output_text == "answer" + @pytest.mark.asyncio async def 
test_evaluator_aclose_closes_underlying_client(monkeypatch): - """``aclose`` must release the HTTP client when one was created.""" + """``aclose`` must release the eagerly-created client without clearing it.""" monkeypatch.setenv("GALILEO_API_KEY", "test-key") from agent_control_evaluator_galileo.luna import LunaEvaluator @@ -258,7 +297,7 @@ async def test_evaluator_aclose_closes_underlying_client(monkeypatch): await evaluator.aclose() fake.close.assert_awaited_once() - assert evaluator._client is None + assert evaluator._client is fake @pytest.mark.asyncio @@ -402,6 +441,7 @@ def test_client_raises_when_no_credentials(monkeypatch): "GALILEO_API_SECRET_KEY", "GALILEO_API_SECRET", "GALILEO_API_KEY", + "GALILEO_LUNA_AUTH_MODE", ): monkeypatch.delenv(name, raising=False) from agent_control_evaluator_galileo.luna.client import GalileoLunaClient @@ -410,10 +450,76 @@ def test_client_raises_when_no_credentials(monkeypatch): GalileoLunaClient() +def test_client_requires_explicit_mode_when_both_credentials_are_present(monkeypatch): + """A mixed credential environment must not silently choose an auth route.""" + monkeypatch.setenv("GALILEO_API_KEY", "public-key") + monkeypatch.setenv("GALILEO_API_SECRET_KEY", "internal-secret") + monkeypatch.delenv("GALILEO_LUNA_AUTH_MODE", raising=False) + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + with pytest.raises(ValueError, match="Both Galileo API key and API secret"): + GalileoLunaClient() + + +def test_client_uses_explicit_public_mode_when_both_credentials_are_present(monkeypatch): + """Explicit public mode should use the API-key route even if a secret is also set.""" + monkeypatch.setenv("GALILEO_API_KEY", "public-key") + monkeypatch.setenv("GALILEO_API_SECRET_KEY", "internal-secret") + monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "public") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + + assert client.auth_mode == "public" + 
endpoint, request_headers = client._endpoint_and_headers(None) + assert endpoint.endswith("/scorers/invoke") + assert "Authorization" not in request_headers + + +def test_client_uses_explicit_internal_mode_when_both_credentials_are_present(monkeypatch): + """Explicit internal mode should use the internal JWT route.""" + monkeypatch.setenv("GALILEO_API_KEY", "public-key") + monkeypatch.setenv("GALILEO_API_SECRET_KEY", "internal-secret") + monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "internal") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + client = GalileoLunaClient() + + assert client.auth_mode == "internal" + endpoint, request_headers = client._endpoint_and_headers(None) + assert endpoint.endswith("/internal/scorers/invoke") + assert request_headers["Authorization"].startswith("Bearer ") + + +def test_client_rejects_mode_without_matching_credential(monkeypatch): + """The selected mode must have its matching credential configured.""" + monkeypatch.delenv("GALILEO_API_SECRET_KEY", raising=False) + monkeypatch.delenv("GALILEO_API_SECRET", raising=False) + monkeypatch.setenv("GALILEO_API_KEY", "public-key") + monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "internal") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY"): + GalileoLunaClient() + + +def test_client_rejects_invalid_auth_mode(monkeypatch): + """Invalid auth mode values should fail during client initialization.""" + monkeypatch.setenv("GALILEO_API_KEY", "public-key") + monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "sideways") + from agent_control_evaluator_galileo.luna.client import GalileoLunaClient + + with pytest.raises(ValueError, match="GALILEO_LUNA_AUTH_MODE"): + GalileoLunaClient() + + class TestDeriveApiUrl: """URL derivation covers every console.* → api.* substitution branch.""" def _client(self, monkeypatch): + monkeypatch.delenv("GALILEO_API_SECRET_KEY", raising=False) + 
monkeypatch.delenv("GALILEO_API_SECRET", raising=False) + monkeypatch.delenv("GALILEO_LUNA_AUTH_MODE", raising=False) monkeypatch.setenv("GALILEO_API_KEY", "test-key") from agent_control_evaluator_galileo.luna.client import GalileoLunaClient diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py index 1b7a6e94..e0cd2051 100644 --- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py +++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py @@ -42,6 +42,7 @@ def test_config_accepts_direct_scorer_fields(self) -> None: assert config.threshold == 0.7 assert config.operator == "gte" assert config.scorer_config == {"temperature": 0} + assert config.payload_field == "input" def test_config_accepts_scorer_id_without_label(self) -> None: from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig @@ -475,4 +476,6 @@ async def test_evaluator_fail_open_sets_error(self) -> None: assert result.matched is False assert result.error == "service unavailable" assert result.metadata is not None + assert "error" not in result.metadata + assert result.metadata["error_type"] == "RuntimeError" assert "fallback_action" not in result.metadata diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md index 534ef640..5ac97cda 100644 --- a/examples/galileo_luna/README.md +++ b/examples/galileo_luna/README.md @@ -17,26 +17,43 @@ Start the Agent Control server from the repo root: make server-run ``` -Configure Galileo: +Configure Galileo public API-key auth: ```bash +export GALILEO_LUNA_AUTH_MODE="public" export GALILEO_API_KEY="your-api-key" export GALILEO_CONSOLE_URL="https://console.demo-v2.galileocloud.io" ``` -If the scorer requires explicit project resolution, set: +For internal deployments, use internal auth instead: ```bash -export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000" +export GALILEO_LUNA_AUTH_MODE="internal" +export GALILEO_API_SECRET_KEY="your-api-secret" 
+export GALILEO_API_URL="https://api.default.svc.cluster.local:8088" ``` Optional scorer settings: ```bash export GALILEO_LUNA_SCORER_LABEL="toxicity" +# Or select by scorer id/version instead of label: +# export GALILEO_LUNA_SCORER_ID="scorer-id" +# export GALILEO_LUNA_SCORER_VERSION_ID="scorer-version-id" export GALILEO_LUNA_THRESHOLD="0.5" +export GALILEO_LUNA_PAYLOAD_FIELD="output" ``` +`GALILEO_LUNA_PAYLOAD_FIELD` is explicit for scalar selected data. This example +selects the agent's drafted reply with `selector.path="output"`, so it sends that +scalar as the scorer `output` field. If a selector returns structured data with +`input` and/or `output` keys, those keys are sent directly and override +`GALILEO_LUNA_PAYLOAD_FIELD`. + +If both `GALILEO_API_KEY` and `GALILEO_API_SECRET_KEY`/`GALILEO_API_SECRET` are +set, `GALILEO_LUNA_AUTH_MODE` is required so the client does not silently choose +an auth path. + Run: ```bash diff --git a/examples/galileo_luna/demo_agent.py b/examples/galileo_luna/demo_agent.py index 878023cf..8c7f59b2 100644 --- a/examples/galileo_luna/demo_agent.py +++ b/examples/galileo_luna/demo_agent.py @@ -4,7 +4,7 @@ Prerequisites: 1. Start server: make server-run 2. Create controls: uv run python setup_controls.py - 3. Set GALILEO_API_KEY where this script runs + 3. 
Set Galileo credentials where this script runs Usage: uv run python demo_agent.py @@ -21,6 +21,7 @@ AGENT_NAME = "galileo-luna-agent" SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") +LUNA_AUTH_MODE = os.getenv("GALILEO_LUNA_AUTH_MODE") logging.basicConfig( level=logging.INFO, @@ -90,9 +91,29 @@ def init_agent() -> None: async def run_demo() -> None: """Run scripted scenarios.""" - if not os.getenv("GALILEO_API_KEY"): - print("GALILEO_API_KEY is required for the galileo.luna evaluator.") - print("Set it before running this demo.") + api_key = os.getenv("GALILEO_API_KEY") + api_secret = os.getenv("GALILEO_API_SECRET_KEY") or os.getenv("GALILEO_API_SECRET") + if not api_key and not api_secret: + print( + "Galileo credentials are required for the galileo.luna evaluator. " + "Set GALILEO_API_KEY for public mode or GALILEO_API_SECRET_KEY for " + "internal mode." + ) + return + if api_key and api_secret and LUNA_AUTH_MODE not in {"public", "internal"}: + print( + "Both GALILEO_API_KEY and GALILEO_API_SECRET_KEY/GALILEO_API_SECRET are set. " + "Set GALILEO_LUNA_AUTH_MODE to 'public' or 'internal'." + ) + return + if LUNA_AUTH_MODE == "public" and not api_key: + print("GALILEO_API_KEY is required when GALILEO_LUNA_AUTH_MODE=public.") + return + if LUNA_AUTH_MODE == "internal" and not api_secret: + print( + "GALILEO_API_SECRET_KEY or GALILEO_API_SECRET is required when " + "GALILEO_LUNA_AUTH_MODE=internal." 
+ ) return print("=" * 72) @@ -100,6 +121,7 @@ async def run_demo() -> None: print("=" * 72) print(f"Server: {SERVER_URL}") print(f"Agent: {AGENT_NAME}") + print(f"Auth: GALILEO_LUNA_AUTH_MODE={LUNA_AUTH_MODE or '(auto if one credential)'}") print() init_agent() diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py index 69a36ad5..fb4c6c76 100644 --- a/examples/galileo_luna/setup_controls.py +++ b/examples/galileo_luna/setup_controls.py @@ -3,8 +3,9 @@ Prerequisites: - Agent Control server running at AGENT_CONTROL_URL, default http://localhost:8000 - - GALILEO_API_KEY set where demo_agent.py will run - - Optional GALILEO_PROJECT_ID for project-scoped scorer resolution + - Galileo credentials set where demo_agent.py will run: + GALILEO_API_KEY with GALILEO_LUNA_AUTH_MODE=public, or + GALILEO_API_SECRET_KEY/GALILEO_API_SECRET with GALILEO_LUNA_AUTH_MODE=internal Usage: uv run python setup_controls.py @@ -24,8 +25,14 @@ SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") LUNA_SCORER_LABEL = os.getenv("GALILEO_LUNA_SCORER_LABEL", "toxicity") +LUNA_SCORER_ID = os.getenv("GALILEO_LUNA_SCORER_ID") +LUNA_SCORER_VERSION_ID = os.getenv("GALILEO_LUNA_SCORER_VERSION_ID") LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5")) -GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID") +LUNA_PAYLOAD_FIELD = os.getenv("GALILEO_LUNA_PAYLOAD_FIELD", "output") +LUNA_AUTH_MODE = os.getenv("GALILEO_LUNA_AUTH_MODE") + +if LUNA_PAYLOAD_FIELD not in {"input", "output"}: + raise ValueError("GALILEO_LUNA_PAYLOAD_FIELD must be either 'input' or 'output'.") DEMO_STEPS = [ { @@ -41,14 +48,16 @@ def luna_config() -> dict[str, Any]: """Build the direct Luna evaluator config used by the composite control.""" config: dict[str, Any] = { - "scorer_label": LUNA_SCORER_LABEL, "threshold": LUNA_THRESHOLD, "operator": "gte", - "payload_field": "output", - "on_error": "allow", + "payload_field": LUNA_PAYLOAD_FIELD, } - if 
GALILEO_PROJECT_ID: - config["project_id"] = GALILEO_PROJECT_ID + if LUNA_SCORER_LABEL: + config["scorer_label"] = LUNA_SCORER_LABEL + if LUNA_SCORER_ID: + config["scorer_id"] = LUNA_SCORER_ID + if LUNA_SCORER_VERSION_ID: + config["scorer_version_id"] = LUNA_SCORER_VERSION_ID return config @@ -158,9 +167,15 @@ async def setup_demo() -> None: print("Setting up direct Galileo Luna demo controls") print(f"Server: {SERVER_URL}") print(f"Agent: {AGENT_NAME}") - print(f"Luna: scorer_label={LUNA_SCORER_LABEL!r}, threshold={LUNA_THRESHOLD}") - if GALILEO_PROJECT_ID: - print(f"Project ID: {GALILEO_PROJECT_ID}") + print( + "Luna: " + f"scorer_label={LUNA_SCORER_LABEL!r}, " + f"scorer_id={LUNA_SCORER_ID!r}, " + f"scorer_version_id={LUNA_SCORER_VERSION_ID!r}, " + f"threshold={LUNA_THRESHOLD}, " + f"payload_field={LUNA_PAYLOAD_FIELD!r}" + ) + print(f"Auth: GALILEO_LUNA_AUTH_MODE={LUNA_AUTH_MODE or '(auto if one credential)'}") async with AgentControlClient(base_url=SERVER_URL, timeout=30.0) as client: await client.health_check() diff --git a/sdks/python/src/agent_control/evaluation_events.py b/sdks/python/src/agent_control/evaluation_events.py index 0efe6e86..a0b37a03 100644 --- a/sdks/python/src/agent_control/evaluation_events.py +++ b/sdks/python/src/agent_control/evaluation_events.py @@ -22,6 +22,19 @@ _FALLBACK_TRACE_ID = "0" * 32 _FALLBACK_SPAN_ID = "0" * 16 _trace_warning_logged = False +_DEBUG_METADATA_KEYS = frozenset( + { + "selected_data", + "selected_data_preview", + "engine_selected_data", + "engine_selected_data_preview", + } +) + + +def _safe_event_metadata(metadata: dict[str, object]) -> dict[str, object]: + """Drop raw/debug metadata that should not be exported as observability data.""" + return {key: value for key, value in metadata.items() if key not in _DEBUG_METADATA_KEYS} def observability_metadata( @@ -88,7 +101,7 @@ def _build_events_for_matches( for match in matches: control_def = control_lookup.get(match.control_id) - event_metadata = 
dict(match.result.metadata or {}) + event_metadata = _safe_event_metadata(dict(match.result.metadata or {})) selector_path = None evaluator_name = None diff --git a/sdks/python/src/agent_control/otel_sink.py b/sdks/python/src/agent_control/otel_sink.py index a238dac6..e724f5af 100644 --- a/sdks/python/src/agent_control/otel_sink.py +++ b/sdks/python/src/agent_control/otel_sink.py @@ -28,6 +28,14 @@ "OpenTelemetry sink selected but no OTLP exporter configuration was found; " "control events will not be exported" ) +_DEBUG_METADATA_ATTRIBUTE_KEYS = frozenset( + { + "selected_data", + "selected_data_preview", + "engine_selected_data", + "engine_selected_data_preview", + } +) AttributeValue = str | bool | int | float | list[str] | list[bool] | list[int] | list[float] @@ -129,6 +137,8 @@ def control_event_to_otel_span(event: ControlExecutionEvent) -> OTELControlEvent attributes["agent_control.error_message"] = event.error_message for key, value in sorted(event.metadata.items()): + if key in _DEBUG_METADATA_ATTRIBUTE_KEYS: + continue attributes[f"agent_control.metadata.{key}"] = _normalize_attribute_value(value) return OTELControlEventSpan( diff --git a/sdks/python/tests/test_observability_updates.py b/sdks/python/tests/test_observability_updates.py index 181d3c6c..dd7f5d2f 100644 --- a/sdks/python/tests/test_observability_updates.py +++ b/sdks/python/tests/test_observability_updates.py @@ -67,14 +67,21 @@ def _make_response(self, **kwargs): defaults.update(kwargs) return EvaluationResponse(**defaults) - def _make_match(self, control_id, control_name="ctrl", action="observe", matched=True): + def _make_match( + self, + control_id, + control_name="ctrl", + action="observe", + matched=True, + metadata=None, + ): from agent_control_models import ControlMatch, EvaluatorResult return ControlMatch( control_id=control_id, control_name=control_name, action=action, - result=EvaluatorResult(matched=matched, confidence=0.9), + result=EvaluatorResult(matched=matched, confidence=0.9, 
metadata=metadata), ) def test_combines_matches_errors_and_non_matches(self): @@ -172,14 +179,21 @@ def _make_request(self, step_type="llm"): stage="pre", ) - def _make_match(self, control_id, control_name="ctrl", action="observe", matched=True): + def _make_match( + self, + control_id, + control_name="ctrl", + action="observe", + matched=True, + metadata=None, + ): from agent_control_models import ControlMatch, EvaluatorResult return ControlMatch( control_id=control_id, control_name=control_name, action=action, - result=EvaluatorResult(matched=matched, confidence=0.9), + result=EvaluatorResult(matched=matched, confidence=0.9, metadata=metadata), ) def _make_response(self, matches=None, errors=None, non_matches=None): @@ -224,6 +238,56 @@ def test_builds_events_with_trace_context(self): assert event.evaluator_name == "regex" assert event.selector_path == "input" + def test_drops_raw_selected_data_from_event_metadata(self): + response = self._make_response( + matches=[ + self._make_match( + 1, + "ctrl-1", + metadata={ + "selected_data": {"prompt": "raw sensitive input"}, + "selected_data_preview": { + "type": "dict", + "value": {"prompt": "raw sensitive input"}, + "truncated": False, + }, + "engine_selected_data": {"prompt": "raw sensitive input"}, + "engine_selected_data_preview": { + "type": "dict", + "value": {"prompt": "raw sensitive input"}, + "truncated": False, + }, + }, + ) + ] + ) + request = self._make_request() + control_lookup = { + 1: self._make_control( + 1, + "ctrl-1", + { + "evaluator": {"name": "regex", "config": {"pattern": "test"}}, + "selector": {"path": "input"}, + }, + ).control + } + + events = build_control_execution_events( + response, + request, + control_lookup, + "trace123", + "span456", + "test-agent", + ) + + assert len(events) == 1 + assert "selected_data" not in events[0].metadata + assert "selected_data_preview" not in events[0].metadata + assert "engine_selected_data" not in events[0].metadata + assert "engine_selected_data_preview" 
not in events[0].metadata + def test_composite_control_uses_representative_observability_identity(self): response = self._make_response(non_matches=[self._make_match(1, "ctrl-1", matched=False)]) request = self._make_request() diff --git a/sdks/python/tests/test_otel_sink.py b/sdks/python/tests/test_otel_sink.py index 6f1c81fd..4d4aa451 100644 --- a/sdks/python/tests/test_otel_sink.py +++ b/sdks/python/tests/test_otel_sink.py @@ -39,7 +39,23 @@ def _make_event(**overrides: object) -> ControlExecutionEvent: evaluator_name="regex", selector_path="input", error_message=None, - metadata={"labels": ["security", "pii"], "threshold": 3, "nested": {"k": "v"}}, + metadata={ + "labels": ["security", "pii"], + "threshold": 3, + "nested": {"k": "v"}, + "selected_data": {"prompt": "raw sensitive input"}, + "selected_data_preview": { + "type": "dict", + "value": {"prompt": "raw sensitive input"}, + "truncated": False, + }, + "engine_selected_data": {"prompt": "raw sensitive input"}, + "engine_selected_data_preview": { + "type": "dict", + "value": {"prompt": "raw sensitive input"}, + "truncated": False, + }, + }, ) return event.model_copy(update=overrides) @@ -227,6 +243,10 @@ def test_control_event_to_otel_span_maps_event_fields() -> None: assert span.attributes["agent_control.matched"] is True assert span.attributes["agent_control.metadata.labels"] == ["security", "pii"] assert span.attributes["agent_control.metadata.nested"] == '{"k": "v"}' + assert "agent_control.metadata.selected_data" not in span.attributes + assert "agent_control.metadata.selected_data_preview" not in span.attributes + assert "agent_control.metadata.engine_selected_data" not in span.attributes + assert "agent_control.metadata.engine_selected_data_preview" not in span.attributes assert span.error_message == "blocked" assert span.end_time_unix_nano >= span.start_time_unix_nano