From 9f1791a91899b8e016d8be18cde8923a770c1a56 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Wed, 6 May 2026 12:47:34 -0700
Subject: [PATCH 01/20] add new luna client

---
 evaluators/contrib/galileo/pyproject.toml     |   1 +
 .../__init__.py                               |  17 +
 .../luna/__init__.py                          |  19 ++
 .../luna/client.py                            | 256 +++++++++++++++
 .../luna/config.py                            |  94 ++++++
 .../luna/evaluator.py                         | 259 ++++++++++++++++
 .../agent_control_evaluator_galileo/py.typed  |   1 +
 .../galileo/tests/test_luna_evaluator.py      | 291 ++++++++++++++++++
 examples/README.md                            |   1 +
 examples/galileo_luna/README.md               |  46 +++
 examples/galileo_luna/demo_agent.py           | 129 ++++++++
 examples/galileo_luna/pyproject.toml          |  25 ++
 examples/galileo_luna/setup_controls.py       | 198 ++++++++++++
 .../src/agent_control/evaluators/__init__.py  |  28 +-
 14 files changed, 1363 insertions(+), 2 deletions(-)
 create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py
 create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
 create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
 create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
 create mode 100644 evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed
 create mode 100644 evaluators/contrib/galileo/tests/test_luna_evaluator.py
 create mode 100644 examples/galileo_luna/README.md
 create mode 100644 examples/galileo_luna/demo_agent.py
 create mode 100644 examples/galileo_luna/pyproject.toml
 create mode 100644 examples/galileo_luna/setup_controls.py

diff --git a/evaluators/contrib/galileo/pyproject.toml b/evaluators/contrib/galileo/pyproject.toml
index ff70f2fb..21b1accc 100644
--- a/evaluators/contrib/galileo/pyproject.toml
+++ b/evaluators/contrib/galileo/pyproject.toml
@@ -23,6 +23,7 @@ dev = [
 ]
 
 [project.entry-points."agent_control.evaluators"]
+"galileo.luna" = "agent_control_evaluator_galileo.luna:LunaEvaluator"
 "galileo.luna2" = "agent_control_evaluator_galileo.luna2:Luna2Evaluator"
 
 [build-system]
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py
index 6389087f..d9269fe1 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py
@@ -3,6 +3,7 @@
 This package provides Galileo evaluators for agent-control.
 
 Available evaluators:
+    - galileo.luna: Galileo Luna direct scorer evaluation
     - galileo.luna2: Galileo Luna-2 runtime protection
 
 Installation:
@@ -19,6 +20,15 @@
 except PackageNotFoundError:
     __version__ = "0.0.0.dev"
 
+from agent_control_evaluator_galileo.luna import (
+    LUNA_AVAILABLE,
+    GalileoLunaClient,
+    LunaEvaluator,
+    LunaEvaluatorConfig,
+    LunaOperator,
+    ScorerInvokeRequest,
+    ScorerInvokeResponse,
+)
 from agent_control_evaluator_galileo.luna2 import (
     LUNA2_AVAILABLE,
     Luna2Evaluator,
@@ -28,6 +38,13 @@
 )
 
 __all__ = [
+    "GalileoLunaClient",
+    "ScorerInvokeRequest",
+    "ScorerInvokeResponse",
+    "LunaEvaluator",
+    "LunaEvaluatorConfig",
+    "LunaOperator",
+    "LUNA_AVAILABLE",
     "Luna2Evaluator",
     "Luna2EvaluatorConfig",
     "Luna2Metric",
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py
new file mode 100644
index 00000000..c3ff0375
--- /dev/null
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py
@@ -0,0 +1,19 @@
+"""Galileo Luna direct scorer evaluator."""
+
+from agent_control_evaluator_galileo.luna.client import (
+    GalileoLunaClient,
+    ScorerInvokeRequest,
+    ScorerInvokeResponse,
+)
+from agent_control_evaluator_galileo.luna.config import LunaEvaluatorConfig, LunaOperator
+from agent_control_evaluator_galileo.luna.evaluator import LUNA_AVAILABLE, LunaEvaluator
+
+__all__ = [
+    "GalileoLunaClient",
+    "ScorerInvokeRequest",
+    "ScorerInvokeResponse",
+    "LunaEvaluatorConfig",
+    "LunaOperator",
+    "LunaEvaluator",
+    "LUNA_AVAILABLE",
+]
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
new file mode 100644
index 00000000..e1638ae3
--- /dev/null
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -0,0 +1,256 @@
+"""Direct HTTP client for Galileo Luna scorer invocation."""
+
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass, field
+from uuid import UUID
+
+import httpx
+from agent_control_models import JSONObject, JSONValue
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_TIMEOUT_SECS = 10.0
+
+
+def _as_float_or_none(value: JSONValue) -> float | None:
+    if isinstance(value, bool) or value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
+        try:
+            return float(value)
+        except ValueError:
+            return None
+    return None
+
+
+@dataclass(frozen=True)
+class ScorerInvokeRequest:
+    """Request payload for Galileo Luna scorer invocation.
+
+    Attributes:
+        metric: Preset, registered, or fine-tuned scorer name.
+        input: Optional user/system prompt text.
+        output: Optional model response text.
+        project_id: Optional Galileo project UUID for project-scoped scorer resolution.
+        luna_model: Optional Luna model override.
+        config: Optional scorer-specific configuration.
+    """
+
+    metric: str
+    input: str | None = None
+    output: str | None = None
+    project_id: str | UUID | None = None
+    luna_model: str | None = None
+    config: JSONObject | None = None
+
+    def to_dict(self) -> JSONObject:
+        """Build the JSON request body, omitting optional fields that are None."""
+        body: JSONObject = {"metric": self.metric}
+        if self.input is not None:
+            body["input"] = self.input
+        if self.output is not None:
+            body["output"] = self.output
+        if self.project_id is not None:
+            body["project_id"] = str(self.project_id)
+        if self.luna_model is not None:
+            body["luna_model"] = self.luna_model
+        if self.config is not None:
+            body["config"] = self.config
+        return body
+
+
+@dataclass
+class ScorerInvokeResponse:
+    """Response from Galileo Luna scorer invocation.
+
+    Attributes:
+        metric: Echoed scorer metric.
+        score: Raw scorer value.
+        status: Invocation status.
+        execution_time: Execution time in seconds, when returned.
+        error_message: Error detail for non-success statuses.
+        raw_response: Full response body for diagnostics.
+    """
+
+    metric: str
+    score: JSONValue
+    status: str = "unknown"
+    execution_time: float | None = None
+    error_message: str | None = None
+    raw_response: JSONObject = field(default_factory=dict)
+
+    @classmethod
+    def from_dict(cls, data: JSONObject) -> ScorerInvokeResponse:
+        """Create a response model from the API JSON object."""
+        metric_value = data.get("metric", "")
+        status_value = data.get("status", "unknown")
+        error_value = data.get("error_message")
+
+        return cls(
+            metric=str(metric_value) if metric_value is not None else "",
+            score=data.get("score"),
+            status=str(status_value) if status_value is not None else "unknown",
+            execution_time=_as_float_or_none(data.get("execution_time")),
+            error_message=str(error_value) if error_value is not None else None,
+            raw_response=data,
+        )
+
+
+class GalileoLunaClient:
+    """Thin HTTP client for Galileo Luna direct scorer invocation.
+
+    Environment Variables:
+        GALILEO_API_KEY: Galileo API key (required).
+        GALILEO_CONSOLE_URL: Galileo Console URL (optional, defaults to production).
+    """
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        console_url: str | None = None,
+    ) -> None:
+        """Initialize the Galileo Luna client.
+
+        Args:
+            api_key: Galileo API key. If not provided, reads from GALILEO_API_KEY.
+            console_url: Galileo Console URL. If not provided, reads from
+                GALILEO_CONSOLE_URL or uses the production console URL.
+
+        Raises:
+            ValueError: If no API key is provided or found in the environment.
+        """
+        resolved_api_key = api_key or os.getenv("GALILEO_API_KEY")
+        if not resolved_api_key:
+            raise ValueError(
+                "GALILEO_API_KEY is required. "
+                "Set it as an environment variable or pass it to the constructor."
+            )
+
+        self.api_key = resolved_api_key
+        self.console_url = (
+            console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai"
+        )
+        self.api_base = self._derive_api_url(self.console_url)
+        self._client: httpx.AsyncClient | None = None
+
+    def _derive_api_url(self, console_url: str) -> str:
+        """Derive the API base URL from a Galileo Console URL."""
+        url = console_url.rstrip("/")
+
+        # Rewrite only the first match so later path/query text is left untouched.
+        if "console." in url:
+            return url.replace("console.", "api.", 1)
+        if url.startswith("https://"):
+            return url.replace("https://", "https://api.", 1)
+        if url.startswith("http://"):
+            return url.replace("http://", "http://api.", 1)
+
+        return url
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create the HTTP client."""
+        if self._client is None or self._client.is_closed:
+            self._client = httpx.AsyncClient(
+                headers={
+                    "Galileo-API-Key": self.api_key,
+                    "Content-Type": "application/json",
+                },
+                timeout=httpx.Timeout(DEFAULT_TIMEOUT_SECS),
+            )
+        return self._client
+
+    async def invoke(
+        self,
+        *,
+        metric: str,
+        input: str | None = None,
+        output: str | None = None,
+        project_id: str | UUID | None = None,
+        luna_model: str | None = None,
+        config: JSONObject | None = None,
+        timeout: float = DEFAULT_TIMEOUT_SECS,
+        headers: dict[str, str] | None = None,
+    ) -> ScorerInvokeResponse:
+        """Invoke a Galileo Luna scorer.
+
+        Args:
+            metric: Preset, registered, or fine-tuned scorer name.
+            input: Optional user/system prompt text.
+            output: Optional model response text.
+            project_id: Optional Galileo project UUID for project-scoped scorer resolution.
+            luna_model: Optional Luna model override.
+            config: Optional scorer-specific configuration.
+            timeout: Request timeout in seconds.
+            headers: Additional request headers.
+
+        Returns:
+            Parsed scorer invocation response.
+
+        Raises:
+            ValueError: If neither input nor output is provided.
+            RuntimeError: If the API response is not a JSON object.
+            httpx.HTTPStatusError: If the API returns an error status code.
+            httpx.RequestError: If the request fails before a response is received.
+        """
+        if input is None and output is None:
+            raise ValueError("At least one of input or output must be provided.")
+
+        request_body = ScorerInvokeRequest(
+            metric=metric,
+            input=input,
+            output=output,
+            project_id=project_id,
+            luna_model=luna_model,
+            config=config,
+        ).to_dict()
+        request_headers = dict(headers or {})
+        endpoint = f"{self.api_base}/scorers/invoke"
+
+        logger.debug("[GalileoLunaClient] POST %s", endpoint)
+        logger.debug("[GalileoLunaClient] Request body: %s", request_body)
+
+        try:
+            client = await self._get_client()
+            response = await client.post(
+                endpoint,
+                json=request_body,
+                headers=request_headers,
+                timeout=timeout,
+            )
+            response.raise_for_status()
+            response_data = response.json()
+            if not isinstance(response_data, dict):
+                raise RuntimeError("Invalid response payload: not a JSON object")
+
+            parsed = ScorerInvokeResponse.from_dict(response_data)
+            logger.debug("[GalileoLunaClient] Response: %s", parsed.raw_response)
+            return parsed
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                "[GalileoLunaClient] API error: %s - %s",
+                exc.response.status_code,
+                exc.response.text,
+            )
+            raise
+        except httpx.RequestError as exc:
+            logger.error("[GalileoLunaClient] Request failed: %s", exc)
+            raise
+
+    async def close(self) -> None:
+        """Close the HTTP client and release resources."""
+        if self._client is not None:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self) -> GalileoLunaClient:
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
+        """Async context manager exit."""
+        await self.close()
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
new file mode 100644
index 00000000..241e040f
--- /dev/null
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -0,0 +1,94 @@
+"""Configuration model for direct Galileo Luna scorer evaluation."""
+
+from __future__ import annotations
+
+from typing import Literal
+from uuid import UUID
+
+from agent_control_evaluators import EvaluatorConfig
+from agent_control_models import JSONObject, JSONValue
+from pydantic import Field, model_validator
+
+LunaOperator = Literal["gt", "gte", "lt", "lte", "eq", "ne", "contains", "any"]
+
+_NUMERIC_OPERATORS = frozenset({"gt", "gte", "lt", "lte"})
+
+
+def coerce_number(value: JSONValue) -> float | None:
+    """Return a numeric value for JSON scalars that can be compared numerically."""
+    if isinstance(value, bool) or value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
+        try:
+            return float(value)
+        except ValueError:
+            return None
+    return None
+
+
+class LunaEvaluatorConfig(EvaluatorConfig):
+    """Configuration for direct Luna scorer evaluation.
+
+    Attributes:
+        metric: Preset, registered, or fine-tuned scorer name.
+        project_id: Optional Galileo project UUID for project-scoped scorer resolution.
+        threshold: Local threshold used by the evaluator for comparison.
+        operator: Local comparison operator. Numeric operators use threshold as a number.
+        luna_model: Optional Luna model override sent to Galileo.
+        scorer_config: Optional scorer-specific config sent as ``config``.
+        timeout_ms: Request timeout in milliseconds.
+        on_error: Error policy: allow=fail open, deny=fail closed.
+        payload_field: Force selected data into input or output. If omitted, root step
+            payloads with input/output use both fields; scalar data is inferred from metric name.
+        include_raw_response: Include the raw API response in EvaluatorResult metadata.
+    """
+
+    metric: str = Field(..., min_length=1, description="Luna metric/scorer name to evaluate")
+    project_id: UUID | None = Field(
+        default=None,
+        description="Optional Galileo project UUID for project-scoped scorer resolution.",
+    )
+    threshold: JSONValue = Field(
+        default=0.5,
+        description="Local threshold used to decide whether the control matches.",
+    )
+    operator: LunaOperator = Field(
+        default="gte",
+        description="Local comparison operator applied to the raw Luna score.",
+    )
+    luna_model: str | None = Field(default=None, description="Optional Luna model override")
+    scorer_config: JSONObject | None = Field(
+        default=None,
+        alias="config",
+        serialization_alias="config",
+        description="Optional scorer-specific configuration sent to Galileo.",
+    )
+    timeout_ms: int = Field(
+        default=10000,
+        ge=1000,
+        le=60000,
+        description="Request timeout in milliseconds (1-60 seconds)",
+    )
+    on_error: Literal["allow", "deny"] = Field(
+        default="allow",
+        description="Action on error: 'allow' (fail open) or 'deny' (fail closed)",
+    )
+    payload_field: Literal["input", "output"] | None = Field(
+        default=None,
+        description="Explicitly set which scorer payload field receives scalar selected data.",
+    )
+    include_raw_response: bool = Field(
+        default=False,
+        description="Include the raw scorer response in result metadata.",
+    )
+
+    @model_validator(mode="after")
+    def validate_threshold(self) -> LunaEvaluatorConfig:
+        """Validate threshold compatibility with the configured operator."""
+        if self.operator in _NUMERIC_OPERATORS and coerce_number(self.threshold) is None:
+            raise ValueError(f"operator '{self.operator}' requires a numeric threshold")
+        if self.operator != "any" and self.threshold is None:
+            raise ValueError("threshold is required unless operator is 'any'")
+        return self
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
new file mode 100644
index 00000000..16a39930
--- /dev/null
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -0,0 +1,259 @@
+"""Direct Galileo Luna evaluator implementation."""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from importlib.metadata import PackageNotFoundError, version
+from typing import Any
+
+from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator
+from agent_control_models import EvaluatorResult, JSONValue
+
+from .client import GalileoLunaClient, ScorerInvokeResponse
+from .config import LunaEvaluatorConfig, coerce_number
+
+logger = logging.getLogger(__name__)
+
+
+def _resolve_package_version() -> str:
+    """Return the installed package version, or a dev fallback during local imports."""
+    try:
+        return version("agent-control-evaluator-galileo")
+    except PackageNotFoundError:
+        return "0.0.0.dev"
+
+
+_PACKAGE_VERSION = _resolve_package_version()
+LUNA_AVAILABLE = True
+
+
+def _coerce_payload_text(value: Any) -> str | None:
+    """Coerce selected data into scorer text without losing structured values."""
+    if value is None or isinstance(value, str):
+        return value
+    if isinstance(value, (int, float, bool)):
+        return str(value)
+    try:
+        return json.dumps(value, ensure_ascii=False, sort_keys=True, default=str)
+    except (TypeError, ValueError):
+        # TypeError: unsupported or unsortable dict keys. ValueError: circular
+        # reference. Fall back to str() instead of failing the evaluation.
+        return str(value)
+
+
+def _has_text(value: str | None) -> bool:
+    return value is not None and value != ""
+
+
+def _extract_dict_text(data: dict[str, Any], key: str) -> str | None:
+    if key not in data:
+        return None
+    return _coerce_payload_text(data.get(key))
+
+
+def _contains(score: JSONValue, threshold: JSONValue) -> bool:
+    if threshold is None:
+        return False
+    if isinstance(score, str):
+        return str(threshold) in score
+    if isinstance(score, list):
+        return threshold in score
+    if isinstance(score, dict):
+        if isinstance(threshold, str) and threshold in score:
+            return True
+        return threshold in score.values()
+    return False
+
+
+def _confidence_from_score(score: JSONValue) -> float:
+    if isinstance(score, bool):
+        return 1.0 if score else 0.0
+    number = coerce_number(score)
+    if number is not None and 0.0 <= number <= 1.0:
+        return number
+    return 1.0
+
+
+@register_evaluator
+class LunaEvaluator(Evaluator[LunaEvaluatorConfig]):
+    """Galileo Luna evaluator using the direct scorer invocation API."""
+
+    metadata = EvaluatorMetadata(
+        name="galileo.luna",
+        version=_PACKAGE_VERSION,
+        description="Galileo Luna direct scorer evaluation",
+        requires_api_key=True,
+        timeout_ms=10000,
+    )
+    config_model = LunaEvaluatorConfig
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Check whether required runtime dependencies are available."""
+        return LUNA_AVAILABLE
+
+    def __init__(self, config: LunaEvaluatorConfig) -> None:
+        """Initialize the direct Luna evaluator.
+
+        Args:
+            config: Validated LunaEvaluatorConfig instance.
+
+        Raises:
+            ValueError: If GALILEO_API_KEY is not set.
+        """
+        if not os.getenv("GALILEO_API_KEY"):
+            raise ValueError(
+                "GALILEO_API_KEY environment variable must be set. "
+                "Set it to a Galileo API key before using galileo.luna."
+            )
+
+        super().__init__(config)
+        self._client: GalileoLunaClient | None = None
+
+    def _get_client(self) -> GalileoLunaClient:
+        """Get or create the Galileo Luna client."""
+        if self._client is None:
+            self._client = GalileoLunaClient()
+        return self._client
+
+    def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
+        """Prepare scorer input/output fields from selected data."""
+        if self.config.payload_field is not None:
+            text = _coerce_payload_text(data)
+            if self.config.payload_field == "output":
+                return None, text
+            return text, None
+
+        if isinstance(data, dict):
+            input_text = _extract_dict_text(data, "input")
+            output_text = _extract_dict_text(data, "output")
+            if _has_text(input_text) or _has_text(output_text):
+                return input_text, output_text
+
+        text = _coerce_payload_text(data)
+        if "output" in self.config.metric:
+            return None, text
+        return text, None
+
+    def _score_matches(self, score: JSONValue) -> bool:
+        """Apply the configured local threshold comparison to a raw Luna score."""
+        operator = self.config.operator
+        threshold = self.config.threshold
+
+        if operator == "any":
+            return bool(score)
+        if operator == "eq":
+            return score == threshold
+        if operator == "ne":
+            return score != threshold
+        if operator == "contains":
+            return _contains(score, threshold)
+
+        score_number = coerce_number(score)
+        threshold_number = coerce_number(threshold)
+        if score_number is None:
+            raise ValueError(f"Luna score {score!r} is not numeric")
+        if threshold_number is None:
+            raise ValueError(f"Luna threshold {threshold!r} is not numeric")
+
+        if operator == "gt":
+            return score_number > threshold_number
+        if operator == "gte":
+            return score_number >= threshold_number
+        if operator == "lt":
+            return score_number < threshold_number
+        if operator == "lte":
+            return score_number <= threshold_number
+
+        raise ValueError(f"Unsupported Luna operator: {operator}")
+
+    async def evaluate(self, data: Any) -> EvaluatorResult:
+        """Evaluate selected data with Galileo Luna direct scorer invocation.
+
+        Args:
+            data: The data selected from the runtime step.
+
+        Returns:
+            EvaluatorResult with local threshold decision and scorer metadata.
+        """
+        input_text, output_text = self._prepare_payload(data)
+        if not (_has_text(input_text) or _has_text(output_text)):
+            return EvaluatorResult(
+                matched=False,
+                confidence=1.0,
+                message="No data to score with Luna",
+                metadata={"metric": self.config.metric},
+            )
+
+        try:
+            response = await self._get_client().invoke(
+                metric=self.config.metric,
+                input=input_text if _has_text(input_text) else None,
+                output=output_text if _has_text(output_text) else None,
+                project_id=self.config.project_id,
+                luna_model=self.config.luna_model,
+                config=self.config.scorer_config,
+                timeout=self.get_timeout_seconds(),
+            )
+
+            if response.status.lower() != "success":
+                message = response.error_message or f"Luna scorer status: {response.status}"
+                raise RuntimeError(message)
+
+            matched = self._score_matches(response.score)
+            metadata = self._metadata(response)
+            operator = self.config.operator
+            threshold = self.config.threshold
+            state = "triggered" if matched else "not triggered"
+            return EvaluatorResult(
+                matched=matched,
+                confidence=_confidence_from_score(response.score),
+                message=(
+                    f"Luna score {response.score!r} {operator} threshold "
+                    f"{threshold!r}: control {state}."
+                ),
+                metadata=metadata,
+            )
+        except Exception as exc:
+            logger.error("Luna evaluation error: %s", exc, exc_info=True)
+            return self._handle_error(exc)
+
+    def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
+        metadata: dict[str, Any] = {
+            "metric": response.metric or self.config.metric,
+            "project_id": str(self.config.project_id) if self.config.project_id else None,
+            "score": response.score,
+            "threshold": self.config.threshold,
+            "operator": self.config.operator,
+            "status": response.status,
+            "execution_time_seconds": response.execution_time,
+            "error_message": response.error_message,
+        }
+        if self.config.include_raw_response:
+            metadata["raw_response"] = response.raw_response
+        return metadata
+
+    def _handle_error(self, error: Exception) -> EvaluatorResult:
+        fallback = self.config.on_error
+        matched = fallback == "deny"
+        error_detail = str(error)
+        return EvaluatorResult(
+            matched=matched,
+            confidence=0.0,
+            message=f"Luna evaluation error: {error_detail}",
+            metadata={
+                "error": error_detail,
+                "error_type": type(error).__name__,
+                "metric": self.config.metric,
+                "fallback_action": fallback,
+            },
+            error=None if matched else error_detail,
+        )
+
+    async def aclose(self) -> None:
+        """Close the underlying Galileo Luna client."""
+        if self._client is not None:
+            await self._client.close()
+            self._client = None
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/py.typed
@@ -0,0 +1 @@
+
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
new file mode 100644
index 00000000..6ca0dced
--- /dev/null
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -0,0 +1,291 @@
+"""Tests for the direct Galileo Luna evaluator and client."""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import AsyncMock, patch
+
+import httpx
+import pytest
+from agent_control_models import EvaluatorResult
+from pydantic import ValidationError
+
+
+class TestLunaEvaluatorConfig:
+    """Tests for direct Luna evaluator configuration."""
+
+    def test_config_accepts_direct_scorer_fields(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig
+
+        # Given: a direct scorer config with local thresholding
+        config = LunaEvaluatorConfig(
+            metric="toxicity",
+            project_id="12345678-1234-5678-1234-567812345678",
+            threshold=0.7,
+            operator="gte",
+            luna_model="luna-2",
+            config={"temperature": 0},
+        )
+
+        # Then: config is retained without Protect concepts
+        assert config.metric == "toxicity"
+        assert str(config.project_id) == "12345678-1234-5678-1234-567812345678"
+        assert config.threshold == 0.7
+        assert config.operator == "gte"
+        assert config.luna_model == "luna-2"
+        assert config.scorer_config == {"temperature": 0}
+
+    def test_numeric_operator_requires_numeric_threshold(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig
+
+        # Given/When/Then: numeric local comparison rejects non-numeric thresholds
+        with pytest.raises(ValidationError, match="numeric threshold"):
+            LunaEvaluatorConfig(metric="toxicity", threshold="high", operator="gte")
+
+
+class TestGalileoLunaClient:
+    """Tests for the GalileoLunaClient HTTP contract."""
+
+    def test_client_uses_protect_api_url_derivation(self) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        # Given: the same console URL shape used by Protect
+        with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}):
+            client = GalileoLunaClient(console_url="https://console.demo-v2.galileocloud.io")
+
+        # Then: the API URL is derived the same way
+        assert client.api_base == "https://api.demo-v2.galileocloud.io"
+
+    @pytest.mark.asyncio
+    async def test_client_posts_to_scorers_invoke_without_protect_fields(self) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        captured: dict[str, object] = {}
+
+        def handler(request: httpx.Request) -> httpx.Response:
+            captured["url"] = str(request.url)
+            captured["headers"] = dict(request.headers)
+            captured["body"] = json.loads(request.content.decode())
+            return httpx.Response(
+                200,
+                json={
+                    "metric": "toxicity",
+                    "score": 0.82,
+                    "status": "success",
+                    "execution_time": 0.12,
+                },
+            )
+
+        # Given: a Luna client with a mock HTTP transport
+        with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}):
+            client = GalileoLunaClient(console_url="https://console.demo-v2.galileocloud.io")
+        client._client = httpx.AsyncClient(
+            transport=httpx.MockTransport(handler),
+            headers={
+                "Galileo-API-Key": client.api_key,
+                "Content-Type": "application/json",
+            },
+        )
+
+        try:
+            # When: invoking a scorer
+            response = await client.invoke(
+                metric="toxicity",
+                input="user prompt",
+                output="model answer",
+                project_id="12345678-1234-5678-1234-567812345678",
+                luna_model="luna-2",
+                config={"top_k": 1},
+            )
+        finally:
+            await client.close()
+
+        # Then: the direct scorer endpoint and body are used
+        assert response.score == 0.82
+        assert captured["url"] == "https://api.demo-v2.galileocloud.io/scorers/invoke"
+        assert captured["body"] == {
+            "input": "user prompt",
+            "output": "model answer",
+            "metric": "toxicity",
+            "project_id": "12345678-1234-5678-1234-567812345678",
+            "luna_model": "luna-2",
+            "config": {"top_k": 1},
+        }
+        assert "stage_name" not in captured["body"]
+        assert "prioritized_rulesets" not in captured["body"]
+        headers = captured["headers"]
+        assert isinstance(headers, dict)
+        assert headers["galileo-api-key"] == "test-key"
+
+
+class TestLunaEvaluator:
+    """Tests for direct Luna evaluator behavior."""
+
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    def test_evaluator_metadata(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        assert LunaEvaluator.metadata.name == "galileo.luna"
+        assert LunaEvaluator.metadata.requires_api_key is True
+
+    @patch.dict(os.environ, {}, clear=True)
+    def test_evaluator_init_without_api_key_raises(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        with pytest.raises(ValueError, match="GALILEO_API_KEY"):
+            LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    @pytest.mark.asyncio
+    async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+        # Given: a direct Luna evaluator and a raw successful scorer response
+        evaluator = LunaEvaluator.from_dict(
+            {
+                "metric": "toxicity",
+                "project_id": "12345678-1234-5678-1234-567812345678",
+                "threshold": 0.7,
+                "operator": "gte",
+                "timeout_ms": 5000,
+            }
+        )
+
+        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+            mock_invoke.return_value = ScorerInvokeResponse(
+                metric="toxicity",
+                score=0.82,
+                status="success",
+                execution_time=0.1,
+            )
+
+            # When: evaluating a full step payload
+            result = await evaluator.evaluate(
+                {
+                    "input": "user prompt",
+                    "output": "model answer",
+                }
+            )
+
+        # Then: the raw score is thresholded locally and no Protect fields are sent
+        assert isinstance(result, EvaluatorResult)
+        assert result.matched is True
+        assert result.confidence == 0.82
+        assert result.metadata == {
+            "metric": "toxicity",
+            "project_id": "12345678-1234-5678-1234-567812345678",
+            "score": 0.82,
+            "threshold": 0.7,
+            "operator": "gte",
+            "status": "success",
+            "execution_time_seconds": 0.1,
+            "error_message": None,
+        }
+        mock_invoke.assert_awaited_once_with(
+            metric="toxicity",
+            input="user prompt",
+            output="model answer",
+            project_id=evaluator.config.project_id,
+            luna_model=None,
+            config=None,
+            timeout=5.0,
+        )
+
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    @pytest.mark.asyncio
+    async def test_evaluator_returns_non_match_below_threshold(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+        # Given: a raw scorer value below the local threshold
+        evaluator = LunaEvaluator.from_dict(
+            {"metric": "toxicity", "threshold": 0.7, "operator": "gte"}
+        )
+
+        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+            mock_invoke.return_value = ScorerInvokeResponse(
+                metric="toxicity",
+                score=0.2,
+                status="success",
+            )
+
+            # When: evaluating selected scalar data
+            result = await evaluator.evaluate("hello")
+
+        # Then: the control does not match
+        assert result.matched is False
+        assert result.confidence == 0.2
+        mock_invoke.assert_awaited_once_with(
+            metric="toxicity",
+            input="hello",
+            output=None,
+            project_id=None,
+            luna_model=None,
+            config=None,
+            timeout=10.0,
+        )
+
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    @pytest.mark.asyncio
+    async def test_evaluator_does_not_call_api_for_empty_data(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+        # Given: an evaluator and empty selected data
+        evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+
+        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+            # When: evaluating empty data
+            result = await evaluator.evaluate("")
+
+        # Then: no remote scorer call is made
+        assert result.matched is False
+        assert result.confidence == 1.0
+        assert result.message == "No data to score with Luna"
+        mock_invoke.assert_not_called()
+
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    @pytest.mark.asyncio
+    async def test_evaluator_fail_open_sets_error(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+        # Given: default fail-open behavior
+        evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+
+        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+            mock_invoke.side_effect = RuntimeError("service unavailable")
+
+            # When: the scorer call fails
+            result = await evaluator.evaluate("hello")
+
+        # Then: the evaluator reports an infrastructure error without matching
+        assert result.matched is False
+        assert result.error == "service unavailable"
+        assert result.metadata is not None
+        assert result.metadata["fallback_action"] == "allow"
+
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    @pytest.mark.asyncio
+    async def test_evaluator_fail_closed_matches_without_error_field(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+        # Given: fail-closed behavior for scorer errors
+        evaluator = LunaEvaluator.from_dict(
+            {"metric": "toxicity", "threshold": 0.5, "on_error": "deny"}
+        )
+
+        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+            mock_invoke.side_effect = RuntimeError("service unavailable")
+
+            # When: the scorer call fails
+            result = await evaluator.evaluate("hello")
+
+        # Then: the control matches so deny/steer actions can be applied by the engine
+        assert result.matched is True
+        assert result.error is None
+        assert result.metadata is not None
+        assert result.metadata["fallback_action"] == "deny"
diff --git a/examples/README.md b/examples/README.md
index 2f488d19..a329dbe7 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -14,6 +14,7 @@ This directory contains runnable examples for Agent Control. Each example has it
 | Customer Support Agent | Enterprise scenario with PII protection, prompt-injection defense, and multiple tools. | https://docs.agentcontrol.dev/examples/customer-support |
 | DeepEval | Build a custom evaluator using DeepEval GEval metrics. | https://docs.agentcontrol.dev/examples/deepeval |
 | Galileo Luna-2 | Toxicity detection and content moderation with Galileo Protect. | https://docs.agentcontrol.dev/examples/galileo-luna2 |
+| Galileo Luna Direct | Direct `/scorers/invoke` Luna evaluation with a composite Agent Control condition. | `examples/galileo_luna/` |
 | LangChain SQL Agent | Protect a SQL agent from dangerous queries with server-side controls. | https://docs.agentcontrol.dev/examples/langchain-sql |
 | Steer Action Demo | Banking transfer agent showcasing observe, deny, and steer actions. | https://docs.agentcontrol.dev/examples/steer-action-demo |
 | Target Context | Bind controls to opaque external targets (e.g. `env=prod`) and let the SDK pin one target per session. | https://docs.agentcontrol.dev/examples/target-context |
diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md
new file mode 100644
index 00000000..d43a2d71
--- /dev/null
+++ b/examples/galileo_luna/README.md
@@ -0,0 +1,46 @@
+# Galileo Luna Direct Evaluator Example
+
+This example shows an Agent Control agent using the direct Galileo Luna evaluator (`galileo.luna`). The evaluator calls Galileo's `/scorers/invoke` API and applies thresholds locally from the control definition.
+
+## What It Shows
+
+- `setup_controls.py` registers an agent and attaches controls.
+- `demo_agent.py` runs an agent step protected with `@control`.
+- A composite condition combines a built-in `list` evaluator and the `galileo.luna` evaluator.
+- A second control uses the `regex` evaluator to block API-key-like values in the generated output.
+
+## Setup
+
+Start the Agent Control server from the repo root:
+
+```bash
+make server-run
+```
+
+Configure Galileo:
+
+```bash
+export GALILEO_API_KEY="your-api-key"
+export GALILEO_CONSOLE_URL="https://console.demo-v2.galileocloud.io"
+```
+
+If the scorer requires explicit project resolution, set:
+
+```bash
+export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000"
+```
+
+Optional scorer settings:
+
+```bash
+export GALILEO_LUNA_METRIC="toxicity"
+export GALILEO_LUNA_THRESHOLD="0.5"
+```
+
+Run:
+
+```bash
+cd examples/galileo_luna
+uv run python setup_controls.py
+uv run python demo_agent.py
+```
diff --git a/examples/galileo_luna/demo_agent.py b/examples/galileo_luna/demo_agent.py
new file mode 100644
index 00000000..878023cf
--- /dev/null
+++ b/examples/galileo_luna/demo_agent.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""Demo agent protected by a direct Galileo Luna evaluator control.
+
+Prerequisites:
+    1. Start server: make server-run
+    2. Create controls: uv run python setup_controls.py
+    3. Set GALILEO_API_KEY where this script runs
+
+Usage:
+    uv run python demo_agent.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+
+import agent_control
+from agent_control import ControlViolationError, control
+
+AGENT_NAME = "galileo-luna-agent"
+SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%H:%M:%S",
+)
+logging.getLogger("agent_control").setLevel(logging.INFO)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("httpcore").setLevel(logging.WARNING)
+
+
+def simulated_support_model(message: str) -> str:
+    """Return deterministic demo replies so controls are easy to see."""
+    lower = message.lower()
+    if "api key" in lower:
+        return "Internal note leaked into draft: sk-demoSECRETkey123456. Please rotate it."
+    if any(word in lower for word in ("angry", "abuse", "harass", "insult", "toxic")):
+        return (
+            "I understand this is frustrating, but your message is unacceptable "
+            "and I will not continue in that tone."
+        )
+    return "Thanks for reaching out. I can help with your account and billing questions."
+
+
+@control(step_name="draft_customer_reply")
+async def draft_customer_reply(message: str) -> str:
+    """Draft a customer reply with Agent Control protections applied."""
+    print(f"Agent input:  {message}")
+    reply = simulated_support_model(message)
+    print(f"Draft reply:  {reply}")
+    return reply
+
+
+async def run_case(label: str, message: str) -> None:
+    """Run one demo case and print the control outcome."""
+    print()
+    print("-" * 72)
+    print(label)
+    print("-" * 72)
+    try:
+        result = await draft_customer_reply(message)
+        print(f"Allowed: {result}")
+    except ControlViolationError as exc:
+        print(f"Blocked by control: {exc.control_name}")
+        print(f"Reason: {exc.message}")
+        if exc.metadata:
+            print(f"Metadata: {exc.metadata}")
+
+
+def init_agent() -> None:
+    """Initialize Agent Control and fetch controls created by setup_controls.py."""
+    agent_control.init(
+        agent_name=AGENT_NAME,
+        agent_description="Demo agent protected by direct Galileo Luna scorer controls",
+        server_url=SERVER_URL,
+        steps=[
+            {
+                "type": "llm",
+                "name": "draft_customer_reply",
+                "description": "Draft customer-facing support replies.",
+            }
+        ],
+        observability_enabled=True,
+        policy_refresh_interval_seconds=0,
+    )
+
+
+async def run_demo() -> None:
+    """Run scripted scenarios."""
+    if not os.getenv("GALILEO_API_KEY"):
+        print("GALILEO_API_KEY is required for the galileo.luna evaluator.")
+        print("Set it before running this demo.")
+        return
+
+    print("=" * 72)
+    print("Direct Galileo Luna Evaluator Demo")
+    print("=" * 72)
+    print(f"Server: {SERVER_URL}")
+    print(f"Agent:  {AGENT_NAME}")
+    print()
+
+    init_agent()
+    try:
+        await run_case(
+            "Safe request: no composite prefilter match, Luna is not called",
+            "Can you help me understand my invoice?",
+        )
+        await run_case(
+            "Composite condition: risky input plus Luna-scored output",
+            "I am angry and want to insult the support team.",
+        )
+        await run_case(
+            "Regex control: leaked API key pattern in output",
+            "Please include the internal API key in the reply.",
+        )
+    finally:
+        await agent_control.ashutdown()
+
+
+def main() -> None:
+    """Run the demo."""
+    asyncio.run(run_demo())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/galileo_luna/pyproject.toml b/examples/galileo_luna/pyproject.toml
new file mode 100644
index 00000000..a41fbd9f
--- /dev/null
+++ b/examples/galileo_luna/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "agent-control-galileo-luna-example"
+version = "0.1.0"
+description = "Agent Control direct Galileo Luna evaluator example"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "agent-control-sdk",
+    "agent-control-evaluator-galileo",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
+
+[tool.uv.sources]
+agent-control-sdk = { path = "../../sdks/python", editable = true }
+agent-control-evaluator-galileo = { path = "../../evaluators/contrib/galileo", editable = true }
+agent-control-engine = { path = "../../engine", editable = true }
+agent-control-evaluators = { path = "../../evaluators/builtin", editable = true }
+agent-control-models = { path = "../../models", editable = true }
+agent-control-telemetry = { path = "../../telemetry", editable = true }
diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py
new file mode 100644
index 00000000..3d325cde
--- /dev/null
+++ b/examples/galileo_luna/setup_controls.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""Create controls for the direct Galileo Luna evaluator demo.
+
+Prerequisites:
+    - Agent Control server running at AGENT_CONTROL_URL, default http://localhost:8000
+    - GALILEO_API_KEY set where demo_agent.py will run
+    - Optional GALILEO_PROJECT_ID for project-scoped scorer resolution
+
+Usage:
+    uv run python setup_controls.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from typing import Any
+
+import httpx
+from agent_control import Agent, AgentControlClient, agents, controls
+
+AGENT_NAME = "galileo-luna-agent"
+AGENT_DESCRIPTION = "Demo agent protected by direct Galileo Luna scorer controls"
+SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
+
+LUNA_METRIC = os.getenv("GALILEO_LUNA_METRIC", "toxicity")
+LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5"))
+GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID")
+
+DEMO_STEPS = [
+    {
+        "type": "llm",
+        "name": "draft_customer_reply",
+        "description": "Draft customer-facing support replies.",
+        "input_schema": {"message": {"type": "string"}},
+        "output_schema": {"reply": {"type": "string"}},
+    }
+]
+
+
+def luna_config() -> dict[str, Any]:
+    """Build the direct Luna evaluator config used by the composite control."""
+    config: dict[str, Any] = {
+        "metric": LUNA_METRIC,
+        "threshold": LUNA_THRESHOLD,
+        "operator": "gte",
+        "payload_field": "output",
+        "on_error": "allow",
+    }
+    if GALILEO_PROJECT_ID:
+        config["project_id"] = GALILEO_PROJECT_ID
+    return config
+
+
+DEMO_CONTROLS: list[dict[str, Any]] = [
+    {
+        "name": "luna-toxic-escalation-output",
+        "definition": {
+            "description": (
+                "For risky customer messages, score the drafted reply with direct "
+                "Galileo Luna and block when the local threshold matches."
+            ),
+            "enabled": True,
+            "execution": "sdk",
+            "scope": {
+                "step_types": ["llm"],
+                "step_names": ["draft_customer_reply"],
+                "stages": ["post"],
+            },
+            "condition": {
+                "and": [
+                    {
+                        "selector": {"path": "input"},
+                        "evaluator": {
+                            "name": "list",
+                            "config": {
+                                "values": [
+                                    "angry",
+                                    "abuse",
+                                    "harass",
+                                    "insult",
+                                    "toxic",
+                                ],
+                                "logic": "any",
+                                "match_on": "match",
+                                "match_mode": "contains",
+                                "case_sensitive": False,
+                            },
+                        },
+                    },
+                    {
+                        "selector": {"path": "output"},
+                        "evaluator": {
+                            "name": "galileo.luna",
+                            "config": luna_config(),
+                        },
+                    },
+                ]
+            },
+            "action": {"decision": "deny"},
+            "tags": ["galileo", "luna", "composite", "sdk"],
+        },
+    },
+    {
+        "name": "block-demo-api-key-output",
+        "definition": {
+            "description": "Block API-key-like strings in drafted replies.",
+            "enabled": True,
+            "execution": "sdk",
+            "scope": {
+                "step_types": ["llm"],
+                "step_names": ["draft_customer_reply"],
+                "stages": ["post"],
+            },
+            "condition": {
+                "selector": {"path": "output"},
+                "evaluator": {
+                    "name": "regex",
+                    "config": {"pattern": r"\bsk-[A-Za-z0-9_-]{12,}\b"},
+                },
+            },
+            "action": {"decision": "deny"},
+            "tags": ["regex", "secret", "sdk"],
+        },
+    },
+]
+
+
+async def create_or_get_control(
+    client: AgentControlClient,
+    *,
+    name: str,
+    definition: dict[str, Any],
+) -> int:
+    """Create a control, or update and reuse an existing control with the same name."""
+    try:
+        result = await controls.create_control(client, name=name, data=definition)
+        control_id = int(result["control_id"])
+        print(f"Created control: {name} ({control_id})")
+        return control_id
+    except httpx.HTTPStatusError as exc:
+        if exc.response.status_code != 409:
+            raise
+
+    page = await controls.list_controls(client, name=name, limit=100)
+    for summary in page.get("controls", []):
+        if summary.get("name") == name:
+            control_id = int(summary["id"])
+            await controls.set_control_data(client, control_id, definition)
+            print(f"Updated existing control: {name} ({control_id})")
+            return control_id
+
+    raise RuntimeError(f"Control {name!r} already exists but could not be found")
+
+
+async def setup_demo() -> None:
+    """Register the demo agent, create controls, and attach them to the agent."""
+    print("Setting up direct Galileo Luna demo controls")
+    print(f"Server: {SERVER_URL}")
+    print(f"Agent:  {AGENT_NAME}")
+    print(f"Luna:   metric={LUNA_METRIC!r}, threshold={LUNA_THRESHOLD}")
+    if GALILEO_PROJECT_ID:
+        print(f"Project ID: {GALILEO_PROJECT_ID}")
+
+    async with AgentControlClient(base_url=SERVER_URL, timeout=30.0) as client:
+        await client.health_check()
+
+        result = await agents.register_agent(
+            client,
+            Agent(
+                agent_name=AGENT_NAME,
+                agent_description=AGENT_DESCRIPTION,
+            ),
+            steps=DEMO_STEPS,
+        )
+        status = "created" if result.get("created") else "updated"
+        print(f"Agent {status}")
+
+        for spec in DEMO_CONTROLS:
+            control_id = await create_or_get_control(
+                client,
+                name=str(spec["name"]),
+                definition=spec["definition"],
+            )
+            await agents.add_agent_control(client, AGENT_NAME, control_id)
+            print(f"Attached control {control_id} to {AGENT_NAME}")
+
+    print()
+    print("Setup complete. Run: uv run python demo_agent.py")
+
+
+def main() -> None:
+    """Run setup."""
+    asyncio.run(setup_demo())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py
index ee77851a..9fd87e71 100644
--- a/sdks/python/src/agent_control/evaluators/__init__.py
+++ b/sdks/python/src/agent_control/evaluators/__init__.py
@@ -10,9 +10,10 @@
 
     Then use `list_evaluators()` to get available evaluators.
 
-Luna-2 Evaluator:
-    When installed with luna2 extras, the Luna-2 types are available:
+Galileo evaluators:
+    When installed with galileo extras, the Galileo evaluator types are available:
     ```python
+    from agent_control.evaluators import LunaEvaluator, LunaEvaluatorConfig  # if galileo installed
     from agent_control.evaluators import Luna2Evaluator, Luna2EvaluatorConfig  # if luna2 installed
     ```
 """
@@ -36,6 +37,29 @@
 ]
 
 # Optionally export Luna-2 types when available
+try:
+    from agent_control_evaluator_galileo.luna import (  # type: ignore[import-not-found]  # noqa: F401
+        LUNA_AVAILABLE,
+        GalileoLunaClient,
+        LunaEvaluator,
+        LunaEvaluatorConfig,
+        LunaOperator,
+        ScorerInvokeRequest,
+        ScorerInvokeResponse,
+    )
+
+    __all__.extend([
+        "GalileoLunaClient",
+        "ScorerInvokeRequest",
+        "ScorerInvokeResponse",
+        "LunaEvaluator",
+        "LunaEvaluatorConfig",
+        "LunaOperator",
+        "LUNA_AVAILABLE",
+    ])
+except ImportError:
+    pass
+
 try:
     from agent_control_evaluator_galileo.luna2 import (  # type: ignore[import-not-found]  # noqa: F401
         LUNA2_AVAILABLE,

From 8d2227d1f1be404bb71bd1511658d1e774b7844f Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Thu, 7 May 2026 16:51:42 -0700
Subject: [PATCH 02/20] fix the url

---
 .../luna/client.py                            |  9 ++++++-
 .../galileo/tests/test_luna_evaluator.py      | 26 +++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index e1638ae3..269d64fc 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -113,6 +113,7 @@ def __init__(
         self,
         api_key: str | None = None,
         console_url: str | None = None,
+        api_url: str | None = None,
     ) -> None:
         """Initialize the Galileo Luna client.
 
@@ -120,6 +121,8 @@ def __init__(
             api_key: Galileo API key. If not provided, reads from GALILEO_API_KEY.
             console_url: Galileo Console URL. If not provided, reads from
                 GALILEO_CONSOLE_URL or uses the production console URL.
+            api_url: Galileo API URL. If not provided, reads from GALILEO_API_URL
+                before deriving from the console URL.
 
         Raises:
             ValueError: If no API key is provided or found in the environment.
@@ -135,7 +138,9 @@ def __init__(
         self.console_url = (
             console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai"
         )
-        self.api_base = self._derive_api_url(self.console_url)
+        self.api_base = (api_url or os.getenv("GALILEO_API_URL") or "").rstrip(
+            "/"
+        ) or self._derive_api_url(self.console_url)
         self._client: httpx.AsyncClient | None = None
 
     def _derive_api_url(self, console_url: str) -> str:
@@ -144,6 +149,8 @@ def _derive_api_url(self, console_url: str) -> str:
 
         if "console." in url:
             return url.replace("console.", "api.")
+        if "console-" in url:
+            return url.replace("console-", "api-", 1)
 
         if url.startswith("https://"):
             return url.replace("https://", "https://api.")
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 6ca0dced..1b7e700e 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -57,6 +57,32 @@ def test_client_uses_protect_api_url_derivation(self) -> None:
         # Then: the API URL is derived the same way
         assert client.api_base == "https://api.demo-v2.galileocloud.io"
 
+    def test_client_uses_galileo_api_url_when_set(self) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        # Given: an explicit API URL that differs from what the console URL would derive to
+        with patch.dict(
+            os.environ,
+            {
+                "GALILEO_API_KEY": "test-key",
+                "GALILEO_API_URL": "https://api-other-stack.gcp-dev.galileo.ai/",
+            },
+        ):
+            client = GalileoLunaClient(console_url="https://console-test-luna.gcp-dev.galileo.ai")
+
+        # Then: the explicit API URL wins over console URL derivation (trailing slash stripped)
+        assert client.api_base == "https://api-other-stack.gcp-dev.galileo.ai"
+
+    def test_client_derives_api_url_from_console_dash_hostname(self) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        # Given: a console-<stack> devstack hostname and no GALILEO_API_URL override in the env
+        with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}, clear=True):
+            client = GalileoLunaClient(console_url="https://console-test-luna.gcp-dev.galileo.ai")
+
+        # Then: the matching api-<stack> hostname is used
+        assert client.api_base == "https://api-test-luna.gcp-dev.galileo.ai"
+
     @pytest.mark.asyncio
     async def test_client_posts_to_scorers_invoke_without_protect_fields(self) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient

From 0cce0bf806123843b50a72cec7ec0da6dd0c02be Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Tue, 12 May 2026 10:38:44 -0700
Subject: [PATCH 03/20] feat(galileo): support internal scorer auth

---
 .../luna/client.py                            | 93 +++++++++++++++---
 .../luna/evaluator.py                         | 14 ++-
 .../galileo/tests/test_luna_evaluator.py      | 95 ++++++++++++++++++-
 3 files changed, 179 insertions(+), 23 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index 269d64fc..e75b74bf 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -4,7 +4,12 @@
 
 import logging
 import os
+from base64 import urlsafe_b64encode
 from dataclasses import dataclass, field
+from hashlib import sha256
+from hmac import new as hmac_new
+from json import dumps
+from time import time
 from uuid import UUID
 
 import httpx
@@ -13,6 +18,38 @@
 logger = logging.getLogger(__name__)
 
 DEFAULT_TIMEOUT_SECS = 10.0
+DEFAULT_INTERNAL_TOKEN_TTL_SECS = 3600
+PUBLIC_SCORER_INVOKE_PATH = "/scorers/invoke"
+INTERNAL_SCORER_INVOKE_PATH = "/internal/scorers/invoke"
+
+
+def _b64url(data: bytes) -> str:  # base64url text with the trailing "=" padding removed
+    return urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
+
+
+def _internal_auth_token(
+    api_secret: str,
+    project_id: str | UUID,
+    ttl_seconds: int = DEFAULT_INTERNAL_TOKEN_TTL_SECS,
+) -> str:
+    """Create the HS256-signed internal JWT expected by Galileo API internal routes."""
+    now = int(time())  # whole seconds since the epoch
+    header = {"alg": "HS256", "typ": "JWT"}
+    payload = {
+        "internal": True,
+        "project_id": str(project_id),  # UUID values are sent in their string form
+        "scope": "scorers.invoke",
+        "iat": now,
+        "exp": now + ttl_seconds,  # expires ttl_seconds after issuance
+    }
+    signing_input = ".".join(  # "<b64url(header)>.<b64url(payload)>" over compact JSON
+        [
+            _b64url(dumps(header, separators=(",", ":")).encode("utf-8")),
+            _b64url(dumps(payload, separators=(",", ":")).encode("utf-8")),
+        ]
+    )
+    signature = hmac_new(api_secret.encode("utf-8"), signing_input.encode("ascii"), sha256).digest()
+    return f"{signing_input}.{_b64url(signature)}"  # header.payload.signature
 
 
 def _as_float_or_none(value: JSONValue) -> float | None:
@@ -33,7 +70,7 @@ class ScorerInvokeRequest:
     """Request payload for Galileo Luna scorer invocation.
 
     Attributes:
-        metric: Preset, registered, or fine-tuned scorer name.
+        metric: Preset, registered, or fine-tuned scorer label.
         input: Optional user/system prompt text.
         output: Optional model response text.
         luna_model: Optional Luna model override.
@@ -50,7 +87,7 @@ class ScorerInvokeRequest:
 
     def to_dict(self) -> JSONObject:
         """Convert to the public API request shape."""
-        body: JSONObject = {"metric": self.metric}
+        body: JSONObject = {"scorer_label": self.metric}
         if self.input is not None:
             body["input"] = self.input
         if self.output is not None:
@@ -87,7 +124,7 @@ class ScorerInvokeResponse:
     @classmethod
     def from_dict(cls, data: JSONObject) -> ScorerInvokeResponse:
         """Create a response model from the API JSON object."""
-        metric_value = data.get("metric", "")
+        metric_value = data.get("scorer_label", data.get("metric", ""))
         status_value = data.get("status", "unknown")
         error_value = data.get("error_message")
 
@@ -105,13 +142,15 @@ class GalileoLunaClient:
     """Thin HTTP client for Galileo Luna direct scorer invocation.
 
     Environment Variables:
-        GALILEO_API_KEY: Galileo API key (required).
+        GALILEO_API_SECRET_KEY or GALILEO_API_SECRET: Galileo API internal JWT signing secret.
+        GALILEO_API_KEY: Galileo API key fallback for public scorer invocation.
         GALILEO_CONSOLE_URL: Galileo Console URL (optional, defaults to production).
     """
 
     def __init__(
         self,
         api_key: str | None = None,
+        api_secret: str | None = None,
         console_url: str | None = None,
         api_url: str | None = None,
     ) -> None:
@@ -119,22 +158,28 @@ def __init__(
 
         Args:
             api_key: Galileo API key. If not provided, reads from GALILEO_API_KEY.
+            api_secret: Galileo API secret for internal JWT auth. If not provided,
+                reads from GALILEO_API_SECRET_KEY or GALILEO_API_SECRET.
             console_url: Galileo Console URL. If not provided, reads from
                 GALILEO_CONSOLE_URL or uses the production console URL.
             api_url: Galileo API URL. If not provided, reads from GALILEO_API_URL
                 before deriving from the console URL.
 
         Raises:
-            ValueError: If no API key is provided or found in the environment.
+            ValueError: If neither API secret nor API key is provided.
         """
+        resolved_api_secret = (
+            api_secret or os.getenv("GALILEO_API_SECRET_KEY") or os.getenv("GALILEO_API_SECRET")
+        )
         resolved_api_key = api_key or os.getenv("GALILEO_API_KEY")
-        if not resolved_api_key:
+        if not resolved_api_secret and not resolved_api_key:
             raise ValueError(
-                "GALILEO_API_KEY is required. "
-                "Set it as an environment variable or pass it to the constructor."
+                "GALILEO_API_SECRET_KEY or GALILEO_API_KEY is required. "
+                "Set one as an environment variable or pass it to the constructor."
             )
 
         self.api_key = resolved_api_key
+        self.api_secret = resolved_api_secret
         self.console_url = (
             console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai"
         )
@@ -162,15 +207,34 @@ def _derive_api_url(self, console_url: str) -> str:
     async def _get_client(self) -> httpx.AsyncClient:
         """Get or create the HTTP client."""
         if self._client is None or self._client.is_closed:
+            headers = {"Content-Type": "application/json"}
+            if not self.api_secret and self.api_key is not None:  # empty secret => key auth
+                headers["Galileo-API-Key"] = self.api_key
             self._client = httpx.AsyncClient(
-                headers={
-                    "Galileo-API-Key": self.api_key,
-                    "Content-Type": "application/json",
-                },
+                headers=headers,
                 timeout=httpx.Timeout(DEFAULT_TIMEOUT_SECS),
             )
         return self._client
 
+    def _endpoint_and_headers(
+        self,
+        project_id: str | UUID | None,
+        headers: dict[str, str] | None,
+    ) -> tuple[str, dict[str, str]]:
+        """Return the invoke URL and per-request headers for the configured auth mode."""
+        request_headers = dict(headers or {})  # copy: never mutate the caller's mapping
+        if not self.api_secret:  # None or "" (empty env var): use the public route
+            return f"{self.api_base}{PUBLIC_SCORER_INVOKE_PATH}", request_headers
+
+        if project_id is None:
+            raise ValueError(
+                "project_id is required when using GALILEO_API_SECRET_KEY internal auth."
+            )
+
+        token = _internal_auth_token(self.api_secret, project_id)  # signed per request
+        request_headers["Authorization"] = f"Bearer {token}"
+        return f"{self.api_base}{INTERNAL_SCORER_INVOKE_PATH}", request_headers
+
     async def invoke(
         self,
         *,
@@ -186,7 +250,7 @@ async def invoke(
         """Invoke a Galileo Luna scorer.
 
         Args:
-            metric: Preset, registered, or fine-tuned scorer name.
+            metric: Preset, registered, or fine-tuned scorer label.
             input: Optional user/system prompt text.
             output: Optional model response text.
             project_id: Optional Galileo project UUID for project-scoped scorer resolution.
@@ -215,8 +279,7 @@ async def invoke(
             luna_model=luna_model,
             config=config,
         ).to_dict()
-        request_headers = dict(headers or {})
-        endpoint = f"{self.api_base}/scorers/invoke"
+        endpoint, request_headers = self._endpoint_and_headers(project_id, headers)
 
         logger.debug("[GalileoLunaClient] POST %s", endpoint)
         logger.debug("[GalileoLunaClient] Request body: %s", request_body)
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index 16a39930..f628cd8e 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -101,12 +101,18 @@ def __init__(self, config: LunaEvaluatorConfig) -> None:
             config: Validated LunaEvaluatorConfig instance.
 
         Raises:
-            ValueError: If GALILEO_API_KEY is not set.
+            ValueError: If neither GALILEO_API_SECRET_KEY nor GALILEO_API_KEY is set.
         """
-        if not os.getenv("GALILEO_API_KEY"):
+        has_auth = (
+            os.getenv("GALILEO_API_SECRET_KEY")
+            or os.getenv("GALILEO_API_SECRET")
+            or os.getenv("GALILEO_API_KEY")
+        )
+        if not has_auth:
             raise ValueError(
-                "GALILEO_API_KEY environment variable must be set. "
-                "Set it to a Galileo API key before using galileo.luna."
+                "GALILEO_API_SECRET_KEY or GALILEO_API_KEY environment variable must be set. "
+                "Set an API secret for internal auth or a Galileo API key before using "
+                "galileo.luna."
             )
 
         super().__init__(config)
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 1b7e700e..53cf58ae 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -4,6 +4,7 @@
 
 import json
 import os
+from base64 import urlsafe_b64decode
 from unittest.mock import AsyncMock, patch
 
 import httpx
@@ -12,6 +13,12 @@
 from pydantic import ValidationError
 
 
+def _decode_jwt_payload(token: str) -> dict[str, object]:
+    payload_segment = token.split(".")[1]  # JWT layout: header.payload.signature
+    padded = payload_segment + ("=" * (-len(payload_segment) % 4))  # restore stripped padding
+    return json.loads(urlsafe_b64decode(padded.encode()).decode())  # signature is not verified
+
+
 class TestLunaEvaluatorConfig:
     """Tests for direct Luna evaluator configuration."""
 
@@ -96,7 +103,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             return httpx.Response(
                 200,
                 json={
-                    "metric": "toxicity",
+                    "scorer_label": "toxicity",
                     "score": 0.82,
                     "status": "success",
                     "execution_time": 0.12,
@@ -133,7 +140,7 @@ def handler(request: httpx.Request) -> httpx.Response:
         assert captured["body"] == {
             "input": "user prompt",
             "output": "model answer",
-            "metric": "toxicity",
+            "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
             "luna_model": "luna-2",
             "config": {"top_k": 1},
@@ -144,6 +151,72 @@ def handler(request: httpx.Request) -> httpx.Response:
         assert isinstance(headers, dict)
         assert headers["galileo-api-key"] == "test-key"
 
+    @pytest.mark.asyncio
+    async def test_client_uses_internal_jwt_when_api_secret_is_set(self) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        captured: dict[str, object] = {}  # filled in by the mock transport handler
+
+        def handler(request: httpx.Request) -> httpx.Response:
+            captured["url"] = str(request.url)
+            captured["headers"] = dict(request.headers)
+            captured["body"] = json.loads(request.content.decode())
+            return httpx.Response(  # canned success payload
+                200,
+                json={
+                    "scorer_label": "toxicity",
+                    "score": 0.82,
+                    "status": "success",
+                    "execution_time": 0.12,
+                },
+            )
+
+        # Given: a Luna client configured with the Galileo API internal secret
+        with patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True):
+            client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088")
+        client._client = httpx.AsyncClient(transport=httpx.MockTransport(handler))  # no network
+
+        try:
+            # When: invoking a scorer with project context
+            response = await client.invoke(
+                metric="toxicity",
+                output="model answer",
+                project_id="12345678-1234-5678-1234-567812345678",
+            )
+        finally:
+            await client.close()
+
+        # Then: the internal scorer endpoint is called with a project-bound JWT
+        assert response.score == 0.82
+        assert captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke"
+        assert captured["body"] == {
+            "output": "model answer",
+            "scorer_label": "toxicity",
+            "project_id": "12345678-1234-5678-1234-567812345678",
+        }
+        headers = captured["headers"]
+        assert isinstance(headers, dict)
+        assert "galileo-api-key" not in headers  # no API-key header on the JWT path
+        auth_header = headers["authorization"]  # header names are lower-cased here
+        assert isinstance(auth_header, str)
+        assert auth_header.startswith("Bearer ")
+        token_payload = _decode_jwt_payload(auth_header.removeprefix("Bearer "))  # claims only
+        assert token_payload["internal"] is True
+        assert token_payload["project_id"] == "12345678-1234-5678-1234-567812345678"
+        assert token_payload["scope"] == "scorers.invoke"
+
+    @pytest.mark.asyncio
+    async def test_client_requires_project_id_for_internal_jwt(self) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        # Given: a Luna client configured with internal JWT auth (secret only, no API key)
+        with patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True):
+            client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088")
+
+        # When/Then: project_id is required because API uses it as the internal auth context
+        with pytest.raises(ValueError, match="project_id is required"):
+            await client.invoke(metric="toxicity", output="model answer")  # no project_id
+
 
 class TestLunaEvaluator:
     """Tests for direct Luna evaluator behavior."""
@@ -156,12 +229,26 @@ def test_evaluator_metadata(self) -> None:
         assert LunaEvaluator.metadata.requires_api_key is True
 
     @patch.dict(os.environ, {}, clear=True)
-    def test_evaluator_init_without_api_key_raises(self) -> None:
+    def test_evaluator_init_without_auth_raises(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluator
 
-        with pytest.raises(ValueError, match="GALILEO_API_KEY"):
+        with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY or GALILEO_API_KEY"):
             LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
 
+    @patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True)
+    def test_evaluator_init_accepts_api_secret(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        evaluator = LunaEvaluator.from_dict(  # must not raise: only the secret is set
+            {
+                "metric": "toxicity",
+                "project_id": "12345678-1234-5678-1234-567812345678",
+                "threshold": 0.5,
+            }
+        )
+
+        assert str(evaluator.config.project_id) == "12345678-1234-5678-1234-567812345678"
+
     @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
     @pytest.mark.asyncio
     async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:

From dd252be06b80c464b9c13929af166dd669cf235d Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Tue, 12 May 2026 10:49:41 -0700
Subject: [PATCH 04/20] add auth and update schema

---
 .../luna/client.py                            | 53 +++++++++----------
 .../luna/config.py                            |  2 -
 .../luna/evaluator.py                         |  1 -
 .../galileo/tests/test_luna_evaluator.py      | 35 +++++++++---
 4 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index e75b74bf..6786c5e8 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -10,10 +10,12 @@
 from hmac import new as hmac_new
 from json import dumps
 from time import time
+from typing import Literal
 from uuid import UUID
 
 import httpx
 from agent_control_models import JSONObject, JSONValue
+from pydantic import BaseModel, Field, model_validator
 
 logger = logging.getLogger(__name__)
 
@@ -65,40 +67,37 @@ def _as_float_or_none(value: JSONValue) -> float | None:
     return None
 
 
-@dataclass(frozen=True)
-class ScorerInvokeRequest:
+ScorerStepType = Literal["session", "trace", "span"]
+
+
+class ScorerInvokeRequest(BaseModel):
     """Request payload for Galileo Luna scorer invocation.
 
     Attributes:
-        metric: Preset, registered, or fine-tuned scorer label.
+        step_type: Runtime step shape used by Galileo scorer input normalization.
         input: Optional user/system prompt text.
         output: Optional model response text.
-        luna_model: Optional Luna model override.
+        scorer_label: Preset, registered, or fine-tuned scorer label.
         project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         config: Optional scorer-specific configuration.
     """
 
-    metric: str
-    input: str | None = None
-    output: str | None = None
+    step_type: ScorerStepType = Field(default="span")
+    input: JSONValue = None
+    output: JSONValue = None
+    scorer_label: str = Field(min_length=1)
     project_id: str | UUID | None = None
-    luna_model: str | None = None
     config: JSONObject | None = None
 
+    @model_validator(mode="after")
+    def ensure_input_or_output(self) -> ScorerInvokeRequest:
+        if self.input is None and self.output is None:
+            raise ValueError("Either input or output must be set.")
+        return self
+
     def to_dict(self) -> JSONObject:
-        """Convert to the public API request shape."""
-        body: JSONObject = {"scorer_label": self.metric}
-        if self.input is not None:
-            body["input"] = self.input
-        if self.output is not None:
-            body["output"] = self.output
-        if self.project_id is not None:
-            body["project_id"] = str(self.project_id)
-        if self.luna_model is not None:
-            body["luna_model"] = self.luna_model
-        if self.config is not None:
-            body["config"] = self.config
-        return body
+        """Convert to the Galileo scorer invoke API request shape."""
+        return self.model_dump(mode="json", exclude_none=True)
 
 
 @dataclass
@@ -239,10 +238,10 @@ async def invoke(
         self,
         *,
         metric: str,
-        input: str | None = None,
-        output: str | None = None,
+        input: JSONValue = None,
+        output: JSONValue = None,
+        step_type: ScorerStepType = "span",
         project_id: str | UUID | None = None,
-        luna_model: str | None = None,
         config: JSONObject | None = None,
         timeout: float = DEFAULT_TIMEOUT_SECS,
         headers: dict[str, str] | None = None,
@@ -253,8 +252,8 @@ async def invoke(
             metric: Preset, registered, or fine-tuned scorer label.
             input: Optional user/system prompt text.
             output: Optional model response text.
+            step_type: Runtime step shape used by Galileo scorer input normalization.
             project_id: Optional Galileo project UUID for project-scoped scorer resolution.
-            luna_model: Optional Luna model override.
             config: Optional scorer-specific configuration.
             timeout: Request timeout in seconds.
             headers: Additional request headers.
@@ -272,11 +271,11 @@ async def invoke(
             raise ValueError("At least one of input or output must be provided.")
 
         request_body = ScorerInvokeRequest(
-            metric=metric,
+            scorer_label=metric,
             input=input,
             output=output,
+            step_type=step_type,
             project_id=project_id,
-            luna_model=luna_model,
             config=config,
         ).to_dict()
         endpoint, request_headers = self._endpoint_and_headers(project_id, headers)
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index 241e040f..3bcc34a3 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -36,7 +36,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         threshold: Local threshold used by the evaluator for comparison.
         operator: Local comparison operator. Numeric operators use threshold as a number.
-        luna_model: Optional Luna model override sent to Galileo.
         scorer_config: Optional scorer-specific config sent as ``config``.
         timeout_ms: Request timeout in milliseconds.
         on_error: Error policy: allow=fail open, deny=fail closed.
@@ -58,7 +57,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         default="gte",
         description="Local comparison operator applied to the raw Luna score.",
     )
-    luna_model: str | None = Field(default=None, description="Optional Luna model override")
     scorer_config: JSONObject | None = Field(
         default=None,
         alias="config",
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index f628cd8e..8afea45d 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -199,7 +199,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                 input=input_text if _has_text(input_text) else None,
                 output=output_text if _has_text(output_text) else None,
                 project_id=self.config.project_id,
-                luna_model=self.config.luna_model,
                 config=self.config.scorer_config,
                 timeout=self.get_timeout_seconds(),
             )
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 53cf58ae..58bd201b 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -31,7 +31,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
             project_id="12345678-1234-5678-1234-567812345678",
             threshold=0.7,
             operator="gte",
-            luna_model="luna-2",
             config={"temperature": 0},
         )
 
@@ -40,7 +39,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
         assert str(config.project_id) == "12345678-1234-5678-1234-567812345678"
         assert config.threshold == 0.7
         assert config.operator == "gte"
-        assert config.luna_model == "luna-2"
         assert config.scorer_config == {"temperature": 0}
 
     def test_numeric_operator_requires_numeric_threshold(self) -> None:
@@ -54,6 +52,33 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None:
 class TestGalileoLunaClient:
     """Tests for the GalileoLunaClient HTTP contract."""
 
+    def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
+        from agent_control_evaluator_galileo.luna import ScorerInvokeRequest
+
+        # Given: a scorer request with project context and scorer config
+        request = ScorerInvokeRequest(
+            scorer_label="toxicity",
+            input={"messages": [{"role": "user", "content": "hello"}]},  # non-string input
+            project_id="12345678-1234-5678-1234-567812345678",
+            config={"top_k": 1},
+        )
+
+        # Then: the serialized payload uses the Orbit scorer invoke fields
+        assert request.to_dict() == {
+            "step_type": "span",  # default; not passed above
+            "input": {"messages": [{"role": "user", "content": "hello"}]},
+            "scorer_label": "toxicity",
+            "project_id": "12345678-1234-5678-1234-567812345678",
+            "config": {"top_k": 1},
+        }  # the unset "output" is omitted from the payload
+
+    def test_scorer_invoke_request_requires_input_or_output(self) -> None:
+        from agent_control_evaluator_galileo.luna import ScorerInvokeRequest
+
+        # Given/When/Then: a request with neither input nor output is rejected (mirrors Orbit)
+        with pytest.raises(ValidationError, match="Either input or output must be set"):
+            ScorerInvokeRequest(scorer_label="toxicity")
+
     def test_client_uses_protect_api_url_derivation(self) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient
 
@@ -128,7 +153,6 @@ def handler(request: httpx.Request) -> httpx.Response:
                 input="user prompt",
                 output="model answer",
                 project_id="12345678-1234-5678-1234-567812345678",
-                luna_model="luna-2",
                 config={"top_k": 1},
             )
         finally:
@@ -142,7 +166,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             "output": "model answer",
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "luna_model": "luna-2",
+            "step_type": "span",
             "config": {"top_k": 1},
         }
         assert "stage_name" not in captured["body"]
@@ -193,6 +217,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             "output": "model answer",
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
+            "step_type": "span",
         }
         headers = captured["headers"]
         assert isinstance(headers, dict)
@@ -301,7 +326,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
             input="user prompt",
             output="model answer",
             project_id=evaluator.config.project_id,
-            luna_model=None,
             config=None,
             timeout=5.0,
         )
@@ -335,7 +359,6 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
             input="hello",
             output=None,
             project_id=None,
-            luna_model=None,
             config=None,
             timeout=10.0,
         )

From 74fcbeb4ce6fd91d3c861daf2b60f6d9e1ffe297 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Tue, 12 May 2026 11:11:57 -0700
Subject: [PATCH 05/20] fix(galileo): align luna scorer response schema

---
 .../luna/client.py                            | 44 +++++++++++--------
 .../luna/evaluator.py                         |  2 +-
 .../galileo/tests/test_luna_evaluator.py      | 42 +++++++++++++++++-
 3 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index 6786c5e8..effc132a 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -5,7 +5,6 @@
 import logging
 import os
 from base64 import urlsafe_b64encode
-from dataclasses import dataclass, field
 from hashlib import sha256
 from hmac import new as hmac_new
 from json import dumps
@@ -15,7 +14,7 @@
 
 import httpx
 from agent_control_models import JSONObject, JSONValue
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, PrivateAttr, model_validator
 
 logger = logging.getLogger(__name__)
 
@@ -100,41 +99,48 @@ def to_dict(self) -> JSONObject:
         return self.model_dump(mode="json", exclude_none=True)
 
 
-@dataclass
-class ScorerInvokeResponse:
+class ScorerInvokeResponse(BaseModel):
     """Response from Galileo Luna scorer invocation.
 
     Attributes:
-        metric: Echoed scorer metric.
+        scorer_label: Echoed scorer label.
         score: Raw scorer value.
         status: Invocation status.
         execution_time: Execution time in seconds, when returned.
         error_message: Error detail for non-success statuses.
-        raw_response: Full response body for diagnostics.
     """
 
-    metric: str
+    scorer_label: str
     score: JSONValue
     status: str = "unknown"
     execution_time: float | None = None
     error_message: str | None = None
-    raw_response: JSONObject = field(default_factory=dict)
+    _raw_response: JSONObject = PrivateAttr(default_factory=dict)
+
+    @model_validator(mode="before")
+    @classmethod
+    def allow_legacy_metric_response(cls, data: object) -> object:
+        if isinstance(data, dict) and "scorer_label" not in data and "metric" in data:
+            return data | {"scorer_label": data["metric"]}
+        return data
+
+    @property
+    def metric(self) -> str:
+        """Backward-compatible alias for existing evaluator metadata code."""
+        return self.scorer_label
+
+    @property
+    def raw_response(self) -> JSONObject:
+        return self._raw_response
 
     @classmethod
     def from_dict(cls, data: JSONObject) -> ScorerInvokeResponse:
         """Create a response model from the API JSON object."""
-        metric_value = data.get("scorer_label", data.get("metric", ""))
-        status_value = data.get("status", "unknown")
-        error_value = data.get("error_message")
-
-        return cls(
-            metric=str(metric_value) if metric_value is not None else "",
-            score=data.get("score"),
-            status=str(status_value) if status_value is not None else "unknown",
-            execution_time=_as_float_or_none(data.get("execution_time")),
-            error_message=str(error_value) if error_value is not None else None,
-            raw_response=data,
+        response = cls.model_validate(
+            data | {"execution_time": _as_float_or_none(data.get("execution_time"))}
         )
+        response._raw_response = data
+        return response
 
 
 class GalileoLunaClient:
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index 8afea45d..9db2f60d 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -227,7 +227,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
 
     def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
         metadata: dict[str, Any] = {
-            "metric": response.metric or self.config.metric,
+            "metric": response.scorer_label or self.config.metric,
             "project_id": str(self.config.project_id) if self.config.project_id else None,
             "score": response.score,
             "threshold": self.config.threshold,
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 58bd201b..de9da5af 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -79,6 +79,44 @@ def test_scorer_invoke_request_requires_input_or_output(self) -> None:
         with pytest.raises(ValidationError, match="Either input or output must be set"):
             ScorerInvokeRequest(scorer_label="toxicity")
 
+    def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
+        from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
+
+        # Given: an API scorer invoke response
+        response = ScorerInvokeResponse.from_dict(
+            {
+                "scorer_label": "toxicity",
+                "score": 0.82,
+                "status": "success",
+                "execution_time": 0.12,
+                "error_message": None,
+            }
+        )
+
+        # Then: the model exposes the Orbit/API response fields
+        assert response.model_dump() == {
+            "scorer_label": "toxicity",
+            "score": 0.82,
+            "status": "success",
+            "execution_time": 0.12,
+            "error_message": None,
+        }
+        assert response.scorer_label == "toxicity"
+        assert response.metric == "toxicity"
+        assert response.raw_response["scorer_label"] == "toxicity"
+
+    def test_scorer_invoke_response_accepts_legacy_metric_field(self) -> None:
+        from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
+
+        # Given/When: an older API response uses metric instead of scorer_label
+        response = ScorerInvokeResponse.from_dict(
+            {"metric": "toxicity", "score": 0.82, "status": "success"}
+        )
+
+        # Then: the client still normalizes it to the current response contract
+        assert response.scorer_label == "toxicity"
+        assert response.model_dump()["scorer_label"] == "toxicity"
+
     def test_client_uses_protect_api_url_derivation(self) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient
 
@@ -293,7 +331,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
             mock_invoke.return_value = ScorerInvokeResponse(
-                metric="toxicity",
+                scorer_label="toxicity",
                 score=0.82,
                 status="success",
                 execution_time=0.1,
@@ -343,7 +381,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
             mock_invoke.return_value = ScorerInvokeResponse(
-                metric="toxicity",
+                scorer_label="toxicity",
                 score=0.2,
                 status="success",
             )

From 7b0a15d2b6d8b8a98a38d311c4818016e92ae394 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Wed, 13 May 2026 12:01:56 -0700
Subject: [PATCH 06/20] update the schemas and corresponding tests

---
 .../luna/client.py                            | 30 ++++-------
 .../luna/config.py                            |  6 +--
 .../luna/evaluator.py                         | 10 ++--
 .../galileo/tests/test_luna_evaluator.py      | 51 +++++++------------
 examples/galileo_luna/README.md               |  2 +-
 examples/galileo_luna/setup_controls.py       |  6 +--
 6 files changed, 40 insertions(+), 65 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index effc132a..426b1782 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -66,14 +66,14 @@ def _as_float_or_none(value: JSONValue) -> float | None:
     return None
 
 
-ScorerStepType = Literal["session", "trace", "span"]
+RootType = Literal["session", "trace", "span"]
 
 
 class ScorerInvokeRequest(BaseModel):
     """Request payload for Galileo Luna scorer invocation.
 
     Attributes:
-        step_type: Runtime step shape used by Galileo scorer input normalization.
+        root_type: Runtime step shape used by Galileo scorer input normalization.
         input: Optional user/system prompt text.
         output: Optional model response text.
         scorer_label: Preset, registered, or fine-tuned scorer label.
@@ -81,7 +81,7 @@ class ScorerInvokeRequest(BaseModel):
         config: Optional scorer-specific configuration.
     """
 
-    step_type: ScorerStepType = Field(default="span")
+    root_type: RootType = Field(default="span")
     input: JSONValue = None
     output: JSONValue = None
     scorer_label: str = Field(min_length=1)
@@ -117,18 +117,6 @@ class ScorerInvokeResponse(BaseModel):
     error_message: str | None = None
     _raw_response: JSONObject = PrivateAttr(default_factory=dict)
 
-    @model_validator(mode="before")
-    @classmethod
-    def allow_legacy_metric_response(cls, data: object) -> object:
-        if isinstance(data, dict) and "scorer_label" not in data and "metric" in data:
-            return data | {"scorer_label": data["metric"]}
-        return data
-
-    @property
-    def metric(self) -> str:
-        """Backward-compatible alias for existing evaluator metadata code."""
-        return self.scorer_label
-
     @property
     def raw_response(self) -> JSONObject:
         return self._raw_response
@@ -243,10 +231,10 @@ def _endpoint_and_headers(
     async def invoke(
         self,
         *,
-        metric: str,
+        scorer_label: str,
         input: JSONValue = None,
         output: JSONValue = None,
-        step_type: ScorerStepType = "span",
+        root_type: RootType = "span",
         project_id: str | UUID | None = None,
         config: JSONObject | None = None,
         timeout: float = DEFAULT_TIMEOUT_SECS,
@@ -255,10 +243,10 @@ async def invoke(
         """Invoke a Galileo Luna scorer.
 
         Args:
-            metric: Preset, registered, or fine-tuned scorer label.
+            scorer_label: Preset, registered, or fine-tuned scorer label.
             input: Optional user/system prompt text.
             output: Optional model response text.
-            step_type: Runtime step shape used by Galileo scorer input normalization.
+            root_type: Runtime step shape used by Galileo scorer input normalization.
             project_id: Optional Galileo project UUID for project-scoped scorer resolution.
             config: Optional scorer-specific configuration.
             timeout: Request timeout in seconds.
@@ -277,10 +265,10 @@ async def invoke(
             raise ValueError("At least one of input or output must be provided.")
 
         request_body = ScorerInvokeRequest(
-            scorer_label=metric,
+            scorer_label=scorer_label,
             input=input,
             output=output,
-            step_type=step_type,
+            root_type=root_type,
             project_id=project_id,
             config=config,
         ).to_dict()
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index 3bcc34a3..1e41a554 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -32,7 +32,7 @@ class LunaEvaluatorConfig(EvaluatorConfig):
     """Configuration for direct Luna scorer evaluation.
 
     Attributes:
-        metric: Preset, registered, or fine-tuned scorer name.
+        scorer_label: Preset, registered, or fine-tuned scorer label.
         project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         threshold: Local threshold used by the evaluator for comparison.
         operator: Local comparison operator. Numeric operators use threshold as a number.
@@ -40,11 +40,11 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         timeout_ms: Request timeout in milliseconds.
         on_error: Error policy: allow=fail open, deny=fail closed.
         payload_field: Force selected data into input or output. If omitted, root step
-            payloads with input/output use both fields; scalar data is inferred from metric name.
+            payloads with input/output use both fields; scalar data is routed by the scorer label.
         include_raw_response: Include the raw API response in EvaluatorResult metadata.
     """
 
-    metric: str = Field(..., min_length=1, description="Luna metric/scorer name to evaluate")
+    scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke")
     project_id: UUID | None = Field(
         default=None,
         description="Optional Galileo project UUID for project-scoped scorer resolution.",
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index 9db2f60d..a5b3f248 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -139,7 +139,7 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
                 return input_text, output_text
 
         text = _coerce_payload_text(data)
-        if "output" in self.config.metric:
+        if "output" in self.config.scorer_label:
             return None, text
         return text, None
 
@@ -190,12 +190,12 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                 matched=False,
                 confidence=1.0,
                 message="No data to score with Luna",
-                metadata={"metric": self.config.metric},
+                metadata={"scorer_label": self.config.scorer_label},
             )
 
         try:
             response = await self._get_client().invoke(
-                metric=self.config.metric,
+                scorer_label=self.config.scorer_label,
                 input=input_text if _has_text(input_text) else None,
                 output=output_text if _has_text(output_text) else None,
                 project_id=self.config.project_id,
@@ -227,7 +227,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
 
     def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
         metadata: dict[str, Any] = {
-            "metric": response.scorer_label or self.config.metric,
+            "scorer_label": response.scorer_label or self.config.scorer_label,
             "project_id": str(self.config.project_id) if self.config.project_id else None,
             "score": response.score,
             "threshold": self.config.threshold,
@@ -251,7 +251,7 @@ def _handle_error(self, error: Exception) -> EvaluatorResult:
             metadata={
                 "error": error_detail,
                 "error_type": type(error).__name__,
-                "metric": self.config.metric,
+                "scorer_label": self.config.scorer_label,
                 "fallback_action": fallback,
             },
             error=None if matched else error_detail,
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index de9da5af..31323a42 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -27,7 +27,7 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
 
         # Given: a direct scorer config with local thresholding
         config = LunaEvaluatorConfig(
-            metric="toxicity",
+            scorer_label="toxicity",
             project_id="12345678-1234-5678-1234-567812345678",
             threshold=0.7,
             operator="gte",
@@ -35,7 +35,7 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
         )
 
         # Then: config is retained without Protect concepts
-        assert config.metric == "toxicity"
+        assert config.scorer_label == "toxicity"
         assert str(config.project_id) == "12345678-1234-5678-1234-567812345678"
         assert config.threshold == 0.7
         assert config.operator == "gte"
@@ -46,7 +46,7 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None:
 
         # Given/When/Then: numeric local comparison rejects non-numeric thresholds
         with pytest.raises(ValidationError, match="numeric threshold"):
-            LunaEvaluatorConfig(metric="toxicity", threshold="high", operator="gte")
+            LunaEvaluatorConfig(scorer_label="toxicity", threshold="high", operator="gte")
 
 
 class TestGalileoLunaClient:
@@ -65,7 +65,7 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
 
         # Then: the serialized payload uses the Orbit scorer invoke fields
         assert request.to_dict() == {
-            "step_type": "span",
+            "root_type": "span",
             "input": {"messages": [{"role": "user", "content": "hello"}]},
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
@@ -102,21 +102,8 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
             "error_message": None,
         }
         assert response.scorer_label == "toxicity"
-        assert response.metric == "toxicity"
         assert response.raw_response["scorer_label"] == "toxicity"
 
-    def test_scorer_invoke_response_accepts_legacy_metric_field(self) -> None:
-        from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
-
-        # Given/When: an older API response uses metric instead of scorer_label
-        response = ScorerInvokeResponse.from_dict(
-            {"metric": "toxicity", "score": 0.82, "status": "success"}
-        )
-
-        # Then: the client still normalizes it to the current response contract
-        assert response.scorer_label == "toxicity"
-        assert response.model_dump()["scorer_label"] == "toxicity"
-
     def test_client_uses_protect_api_url_derivation(self) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient
 
@@ -187,7 +174,7 @@ def handler(request: httpx.Request) -> httpx.Response:
         try:
             # When: invoking a scorer
             response = await client.invoke(
-                metric="toxicity",
+                scorer_label="toxicity",
                 input="user prompt",
                 output="model answer",
                 project_id="12345678-1234-5678-1234-567812345678",
@@ -204,7 +191,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             "output": "model answer",
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "step_type": "span",
+            "root_type": "span",
             "config": {"top_k": 1},
         }
         assert "stage_name" not in captured["body"]
@@ -241,7 +228,7 @@ def handler(request: httpx.Request) -> httpx.Response:
         try:
             # When: invoking a scorer with project context
             response = await client.invoke(
-                metric="toxicity",
+                scorer_label="toxicity",
                 output="model answer",
                 project_id="12345678-1234-5678-1234-567812345678",
             )
@@ -255,7 +242,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             "output": "model answer",
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "step_type": "span",
+            "root_type": "span",
         }
         headers = captured["headers"]
         assert isinstance(headers, dict)
@@ -278,7 +265,7 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None:
 
         # When/Then: project_id is required because API uses it as the internal auth context
         with pytest.raises(ValueError, match="project_id is required"):
-            await client.invoke(metric="toxicity", output="model answer")
+            await client.invoke(scorer_label="toxicity", output="model answer")
 
 
 class TestLunaEvaluator:
@@ -296,7 +283,7 @@ def test_evaluator_init_without_auth_raises(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluator
 
         with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY or GALILEO_API_KEY"):
-            LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+            LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
     @patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True)
     def test_evaluator_init_accepts_api_secret(self) -> None:
@@ -304,7 +291,7 @@ def test_evaluator_init_accepts_api_secret(self) -> None:
 
         evaluator = LunaEvaluator.from_dict(
             {
-                "metric": "toxicity",
+                "scorer_label": "toxicity",
                 "project_id": "12345678-1234-5678-1234-567812345678",
                 "threshold": 0.5,
             }
@@ -321,7 +308,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
         # Given: a direct Luna evaluator and a raw successful scorer response
         evaluator = LunaEvaluator.from_dict(
             {
-                "metric": "toxicity",
+                "scorer_label": "toxicity",
                 "project_id": "12345678-1234-5678-1234-567812345678",
                 "threshold": 0.7,
                 "operator": "gte",
@@ -350,7 +337,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
         assert result.matched is True
         assert result.confidence == 0.82
         assert result.metadata == {
-            "metric": "toxicity",
+            "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
             "score": 0.82,
             "threshold": 0.7,
@@ -360,7 +347,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
             "error_message": None,
         }
         mock_invoke.assert_awaited_once_with(
-            metric="toxicity",
+            scorer_label="toxicity",
             input="user prompt",
             output="model answer",
             project_id=evaluator.config.project_id,
@@ -376,7 +363,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
 
         # Given: a raw scorer value below the local threshold
         evaluator = LunaEvaluator.from_dict(
-            {"metric": "toxicity", "threshold": 0.7, "operator": "gte"}
+            {"scorer_label": "toxicity", "threshold": 0.7, "operator": "gte"}
         )
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
@@ -393,7 +380,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
         assert result.matched is False
         assert result.confidence == 0.2
         mock_invoke.assert_awaited_once_with(
-            metric="toxicity",
+            scorer_label="toxicity",
             input="hello",
             output=None,
             project_id=None,
@@ -408,7 +395,7 @@ async def test_evaluator_does_not_call_api_for_empty_data(self) -> None:
         from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
 
         # Given: an evaluator and empty selected data
-        evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+        evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
             # When: evaluating empty data
@@ -427,7 +414,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
         from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
 
         # Given: default fail-open behavior
-        evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+        evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
             mock_invoke.side_effect = RuntimeError("service unavailable")
@@ -449,7 +436,7 @@ async def test_evaluator_fail_closed_matches_without_error_field(self) -> None:
 
         # Given: fail-closed behavior for scorer errors
         evaluator = LunaEvaluator.from_dict(
-            {"metric": "toxicity", "threshold": 0.5, "on_error": "deny"}
+            {"scorer_label": "toxicity", "threshold": 0.5, "on_error": "deny"}
         )
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md
index d43a2d71..534ef640 100644
--- a/examples/galileo_luna/README.md
+++ b/examples/galileo_luna/README.md
@@ -33,7 +33,7 @@ export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000"
 Optional scorer settings:
 
 ```bash
-export GALILEO_LUNA_METRIC="toxicity"
+export GALILEO_LUNA_SCORER_LABEL="toxicity"
 export GALILEO_LUNA_THRESHOLD="0.5"
 ```
 
diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py
index 3d325cde..69a36ad5 100644
--- a/examples/galileo_luna/setup_controls.py
+++ b/examples/galileo_luna/setup_controls.py
@@ -23,7 +23,7 @@
 AGENT_DESCRIPTION = "Demo agent protected by direct Galileo Luna scorer controls"
 SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
 
-LUNA_METRIC = os.getenv("GALILEO_LUNA_METRIC", "toxicity")
+LUNA_SCORER_LABEL = os.getenv("GALILEO_LUNA_SCORER_LABEL", "toxicity")
 LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5"))
 GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID")
 
@@ -41,7 +41,7 @@
 def luna_config() -> dict[str, Any]:
     """Build the direct Luna evaluator config used by the composite control."""
     config: dict[str, Any] = {
-        "metric": LUNA_METRIC,
+        "scorer_label": LUNA_SCORER_LABEL,
         "threshold": LUNA_THRESHOLD,
         "operator": "gte",
         "payload_field": "output",
@@ -158,7 +158,7 @@ async def setup_demo() -> None:
     print("Setting up direct Galileo Luna demo controls")
     print(f"Server: {SERVER_URL}")
     print(f"Agent:  {AGENT_NAME}")
-    print(f"Luna:   metric={LUNA_METRIC!r}, threshold={LUNA_THRESHOLD}")
+    print(f"Luna:   scorer_label={LUNA_SCORER_LABEL!r}, threshold={LUNA_THRESHOLD}")
     if GALILEO_PROJECT_ID:
         print(f"Project ID: {GALILEO_PROJECT_ID}")
 

From 523524d07fb9837fa574106fe6346a07f25e25be Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Wed, 13 May 2026 17:37:14 -0700
Subject: [PATCH 07/20] update the scorer schemas

---
 .../luna/__init__.py                          |  2 +
 .../luna/client.py                            | 33 ++++++++--------
 .../galileo/tests/test_luna_evaluator.py      | 37 +++++++++---------
 .../src/agent_control/evaluators/__init__.py  | 38 +++++++++++--------
 4 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py
index c3ff0375..b26feaac 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/__init__.py
@@ -2,6 +2,7 @@
 
 from agent_control_evaluator_galileo.luna.client import (
     GalileoLunaClient,
+    ScorerInvokeInputs,
     ScorerInvokeRequest,
     ScorerInvokeResponse,
 )
@@ -10,6 +11,7 @@
 
 __all__ = [
     "GalileoLunaClient",
+    "ScorerInvokeInputs",
     "ScorerInvokeRequest",
     "ScorerInvokeResponse",
     "LunaEvaluatorConfig",
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index 426b1782..a2ccdc3f 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -9,7 +9,6 @@
 from hmac import new as hmac_new
 from json import dumps
 from time import time
-from typing import Literal
 from uuid import UUID
 
 import httpx
@@ -66,32 +65,38 @@ def _as_float_or_none(value: JSONValue) -> float | None:
     return None
 
 
-RootType = Literal["session", "trace", "span"]
+def _has_value(value: JSONValue) -> bool:
+    return value is not None and value != ""
+
+
+class ScorerInvokeInputs(BaseModel):
+    """Input values sent to Galileo's scorer invoke API."""
+
+    query: JSONValue = ""
+    response: JSONValue = ""
+    ground_truth: JSONValue = None
+    tools: JSONValue = None
 
 
 class ScorerInvokeRequest(BaseModel):
     """Request payload for Galileo Luna scorer invocation.
 
     Attributes:
-        root_type: Runtime step shape used by Galileo scorer input normalization.
-        input: Optional user/system prompt text.
-        output: Optional model response text.
+        inputs: Selected scorer input values.
         scorer_label: Preset, registered, or fine-tuned scorer label.
         project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         config: Optional scorer-specific configuration.
     """
 
-    root_type: RootType = Field(default="span")
-    input: JSONValue = None
-    output: JSONValue = None
     scorer_label: str = Field(min_length=1)
+    inputs: ScorerInvokeInputs
     project_id: str | UUID | None = None
     config: JSONObject | None = None
 
     @model_validator(mode="after")
     def ensure_input_or_output(self) -> ScorerInvokeRequest:
-        if self.input is None and self.output is None:
-            raise ValueError("Either input or output must be set.")
+        if not (_has_value(self.inputs.query) or _has_value(self.inputs.response)):
+            raise ValueError("Either inputs.query or inputs.response must be set.")
         return self
 
     def to_dict(self) -> JSONObject:
@@ -234,7 +239,6 @@ async def invoke(
         scorer_label: str,
         input: JSONValue = None,
         output: JSONValue = None,
-        root_type: RootType = "span",
         project_id: str | UUID | None = None,
         config: JSONObject | None = None,
         timeout: float = DEFAULT_TIMEOUT_SECS,
@@ -246,7 +250,6 @@ async def invoke(
             scorer_label: Preset, registered, or fine-tuned scorer label.
             input: Optional user/system prompt text.
             output: Optional model response text.
-            root_type: Runtime step shape used by Galileo scorer input normalization.
             project_id: Optional Galileo project UUID for project-scoped scorer resolution.
             config: Optional scorer-specific configuration.
             timeout: Request timeout in seconds.
@@ -266,9 +269,9 @@ async def invoke(
 
         request_body = ScorerInvokeRequest(
             scorer_label=scorer_label,
-            input=input,
-            output=output,
-            root_type=root_type,
+            inputs=ScorerInvokeInputs(
+                query="" if input is None else input, response="" if output is None else output
+            ),
             project_id=project_id,
             config=config,
         ).to_dict()
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 31323a42..9f4ae862 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -52,22 +52,24 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None:
 class TestGalileoLunaClient:
     """Tests for the GalileoLunaClient HTTP contract."""
 
-    def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
-        from agent_control_evaluator_galileo.luna import ScorerInvokeRequest
+    def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
+        from agent_control_evaluator_galileo.luna import ScorerInvokeInputs, ScorerInvokeRequest
 
         # Given: a scorer request with project context and scorer config
         request = ScorerInvokeRequest(
             scorer_label="toxicity",
-            input={"messages": [{"role": "user", "content": "hello"}]},
+            inputs=ScorerInvokeInputs(query={"messages": [{"role": "user", "content": "hello"}]}),
             project_id="12345678-1234-5678-1234-567812345678",
             config={"top_k": 1},
         )
 
-        # Then: the serialized payload uses the Orbit scorer invoke fields
+        # Then: the serialized payload uses the API-owned scorer invoke fields
         assert request.to_dict() == {
-            "root_type": "span",
-            "input": {"messages": [{"role": "user", "content": "hello"}]},
             "scorer_label": "toxicity",
+            "inputs": {
+                "query": {"messages": [{"role": "user", "content": "hello"}]},
+                "response": "",
+            },
             "project_id": "12345678-1234-5678-1234-567812345678",
             "config": {"top_k": 1},
         }
@@ -75,11 +77,13 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
     def test_scorer_invoke_request_requires_input_or_output(self) -> None:
         from agent_control_evaluator_galileo.luna import ScorerInvokeRequest
 
-        # Given/When/Then: the request mirrors Orbit validation
-        with pytest.raises(ValidationError, match="Either input or output must be set"):
-            ScorerInvokeRequest(scorer_label="toxicity")
+        # Given/When/Then: the request mirrors API validation
+        with pytest.raises(
+            ValidationError, match="Either inputs.query or inputs.response must be set"
+        ):
+            ScorerInvokeRequest(scorer_label="toxicity", inputs={})
 
-    def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
+    def test_scorer_invoke_response_matches_api_schema_shape(self) -> None:
         from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
 
         # Given: an API scorer invoke response
@@ -93,7 +97,7 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
             }
         )
 
-        # Then: the model exposes the Orbit/API response fields
+        # Then: the model exposes the API response fields
         assert response.model_dump() == {
             "scorer_label": "toxicity",
             "score": 0.82,
@@ -187,11 +191,9 @@ def handler(request: httpx.Request) -> httpx.Response:
         assert response.score == 0.82
         assert captured["url"] == "https://api.demo-v2.galileocloud.io/scorers/invoke"
         assert captured["body"] == {
-            "input": "user prompt",
-            "output": "model answer",
             "scorer_label": "toxicity",
+            "inputs": {"query": "user prompt", "response": "model answer"},
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "root_type": "span",
             "config": {"top_k": 1},
         }
         assert "stage_name" not in captured["body"]
@@ -237,12 +239,13 @@ def handler(request: httpx.Request) -> httpx.Response:
 
         # Then: the internal scorer endpoint is called with a project-bound JWT
         assert response.score == 0.82
-        assert captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke"
+        assert (
+            captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke"
+        )
         assert captured["body"] == {
-            "output": "model answer",
             "scorer_label": "toxicity",
+            "inputs": {"query": "", "response": "model answer"},
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "root_type": "span",
         }
         headers = captured["headers"]
         assert isinstance(headers, dict)
diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py
index 9fd87e71..8366a107 100644
--- a/sdks/python/src/agent_control/evaluators/__init__.py
+++ b/sdks/python/src/agent_control/evaluators/__init__.py
@@ -44,19 +44,23 @@
         LunaEvaluator,
         LunaEvaluatorConfig,
         LunaOperator,
+        ScorerInvokeInputs,
         ScorerInvokeRequest,
         ScorerInvokeResponse,
     )
 
-    __all__.extend([
-        "GalileoLunaClient",
-        "ScorerInvokeRequest",
-        "ScorerInvokeResponse",
-        "LunaEvaluator",
-        "LunaEvaluatorConfig",
-        "LunaOperator",
-        "LUNA_AVAILABLE",
-    ])
+    __all__.extend(
+        [
+            "GalileoLunaClient",
+            "ScorerInvokeInputs",
+            "ScorerInvokeRequest",
+            "ScorerInvokeResponse",
+            "LunaEvaluator",
+            "LunaEvaluatorConfig",
+            "LunaOperator",
+            "LUNA_AVAILABLE",
+        ]
+    )
 except ImportError:
     pass
 
@@ -69,12 +73,14 @@
         Luna2Operator,
     )
 
-    __all__.extend([
-        "Luna2Evaluator",
-        "Luna2EvaluatorConfig",
-        "Luna2Metric",
-        "Luna2Operator",
-        "LUNA2_AVAILABLE",
-    ])
+    __all__.extend(
+        [
+            "Luna2Evaluator",
+            "Luna2EvaluatorConfig",
+            "Luna2Metric",
+            "Luna2Operator",
+            "LUNA2_AVAILABLE",
+        ]
+    )
 except ImportError:
     pass

From 34f430df0b8934a670286ea4c9712254fd35e748 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Wed, 13 May 2026 21:56:33 -0700
Subject: [PATCH 08/20] update luna client schemas: treat blank/empty inputs as missing

---
 .../luna/client.py                            | 10 +++++++--
 .../galileo/tests/test_luna_evaluator.py      | 21 +++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index a2ccdc3f..86033339 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -66,7 +66,13 @@ def _as_float_or_none(value: JSONValue) -> float | None:
 
 
 def _has_value(value: JSONValue) -> bool:
-    return value is not None and value != ""
+    if value is None:
+        return False
+    if isinstance(value, str):
+        return value.strip() != ""
+    if isinstance(value, (list, dict)):
+        return len(value) > 0
+    return True
 
 
 class ScorerInvokeInputs(BaseModel):
@@ -264,7 +270,7 @@ async def invoke(
             httpx.HTTPStatusError: If the API returns an error status code.
             httpx.RequestError: If the request fails before a response is received.
         """
-        if input is None and output is None:
+        if not (_has_value(input) or _has_value(output)):
             raise ValueError("At least one of input or output must be provided.")
 
         request_body = ScorerInvokeRequest(
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 9f4ae862..80a5e00b 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -74,14 +74,18 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
             "config": {"top_k": 1},
         }
 
-    def test_scorer_invoke_request_requires_input_or_output(self) -> None:
+    @pytest.mark.parametrize("empty_value", ["", " ", {}, []])
+    def test_scorer_invoke_request_requires_input_or_output(self, empty_value: object) -> None:
         from agent_control_evaluator_galileo.luna import ScorerInvokeRequest
 
         # Given/When/Then: the request mirrors API validation
         with pytest.raises(
             ValidationError, match="Either inputs.query or inputs.response must be set"
         ):
-            ScorerInvokeRequest(scorer_label="toxicity", inputs={})
+            ScorerInvokeRequest(
+                scorer_label="toxicity",
+                inputs={"query": empty_value, "response": empty_value},
+            )
 
     def test_scorer_invoke_response_matches_api_schema_shape(self) -> None:
         from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
@@ -270,6 +274,19 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None:
         with pytest.raises(ValueError, match="project_id is required"):
             await client.invoke(scorer_label="toxicity", output="model answer")
 
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("empty_value", ["", " ", {}, []])
+    async def test_client_rejects_missing_input_and_output_values(self, empty_value: object) -> None:
+        from agent_control_evaluator_galileo.luna import GalileoLunaClient
+
+        # Given: a Luna client and scorer input values that API treats as missing
+        with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}, clear=True):
+            client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088")
+
+        # When/Then: the client rejects the request before calling API
+        with pytest.raises(ValueError, match="At least one of input or output must be provided"):
+            await client.invoke(scorer_label="toxicity", input=empty_value, output=empty_value)
+
 
 class TestLunaEvaluator:
     """Tests for direct Luna evaluator behavior."""

From ad0b2dc98b30fcaffe0c5897cfee08e96de83e03 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Wed, 13 May 2026 21:59:37 -0700
Subject: [PATCH 09/20] fix tests: wrap over-long test signature

---
 evaluators/contrib/galileo/tests/test_luna_evaluator.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 80a5e00b..5cf1fcf8 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -276,7 +276,9 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None:
 
     @pytest.mark.asyncio
     @pytest.mark.parametrize("empty_value", ["", " ", {}, []])
-    async def test_client_rejects_missing_input_and_output_values(self, empty_value: object) -> None:
+    async def test_client_rejects_missing_input_and_output_values(
+        self, empty_value: object
+    ) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient
 
         # Given: a Luna client and scorer input values that API treats as missing

From 81cea0471518dd22e0964889412155d5122881de Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Thu, 14 May 2026 15:11:17 -0700
Subject: [PATCH 10/20] remove unwanted fields (on_error, payload_field, include_raw_response)

---
 .../luna/config.py                            | 16 -----------
 .../luna/evaluator.py                         | 15 ++---------
 .../galileo/tests/test_luna_evaluator.py      | 27 ++-----------------
 3 files changed, 4 insertions(+), 54 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index 1e41a554..7bf5de48 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -38,10 +38,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         operator: Local comparison operator. Numeric operators use threshold as a number.
         scorer_config: Optional scorer-specific config sent as ``config``.
         timeout_ms: Request timeout in milliseconds.
-        on_error: Error policy: allow=fail open, deny=fail closed.
-        payload_field: Force selected data into input or output. If omitted, root step
-            payloads with input/output use both fields; scalar data is inferred from scorer label.
-        include_raw_response: Include the raw API response in EvaluatorResult metadata.
     """
 
     scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke")
@@ -69,18 +65,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         le=60000,
         description="Request timeout in milliseconds (1-60 seconds)",
     )
-    on_error: Literal["allow", "deny"] = Field(
-        default="allow",
-        description="Action on error: 'allow' (fail open) or 'deny' (fail closed)",
-    )
-    payload_field: Literal["input", "output"] | None = Field(
-        default=None,
-        description="Explicitly set which scorer payload field receives scalar selected data.",
-    )
-    include_raw_response: bool = Field(
-        default=False,
-        description="Include the raw scorer response in result metadata.",
-    )
 
     @model_validator(mode="after")
     def validate_threshold(self) -> LunaEvaluatorConfig:
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index a5b3f248..f9e0ad0d 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -126,12 +126,6 @@ def _get_client(self) -> GalileoLunaClient:
 
     def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
         """Prepare scorer input/output fields from selected data."""
-        if self.config.payload_field is not None:
-            text = _coerce_payload_text(data)
-            if self.config.payload_field == "output":
-                return None, text
-            return text, None
-
         if isinstance(data, dict):
             input_text = _extract_dict_text(data, "input")
             output_text = _extract_dict_text(data, "output")
@@ -236,25 +230,20 @@ def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
             "execution_time_seconds": response.execution_time,
             "error_message": response.error_message,
         }
-        if self.config.include_raw_response:
-            metadata["raw_response"] = response.raw_response
         return metadata
 
     def _handle_error(self, error: Exception) -> EvaluatorResult:
-        fallback = self.config.on_error
-        matched = fallback == "deny"
         error_detail = str(error)
         return EvaluatorResult(
-            matched=matched,
+            matched=False,
             confidence=0.0,
             message=f"Luna evaluation error: {error_detail}",
             metadata={
                 "error": error_detail,
                 "error_type": type(error).__name__,
                 "scorer_label": self.config.scorer_label,
-                "fallback_action": fallback,
             },
-            error=None if matched else error_detail,
+            error=error_detail,
         )
 
     async def aclose(self) -> None:
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 5cf1fcf8..1b0bcef8 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -435,7 +435,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluator
         from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
 
-        # Given: default fail-open behavior
+        # Given: fixed fail-open behavior for scorer errors
         evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
@@ -448,27 +448,4 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
         assert result.matched is False
         assert result.error == "service unavailable"
         assert result.metadata is not None
-        assert result.metadata["fallback_action"] == "allow"
-
-    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
-    @pytest.mark.asyncio
-    async def test_evaluator_fail_closed_matches_without_error_field(self) -> None:
-        from agent_control_evaluator_galileo.luna import LunaEvaluator
-        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
-
-        # Given: fail-closed behavior for scorer errors
-        evaluator = LunaEvaluator.from_dict(
-            {"scorer_label": "toxicity", "threshold": 0.5, "on_error": "deny"}
-        )
-
-        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
-            mock_invoke.side_effect = RuntimeError("service unavailable")
-
-            # When: the scorer call fails
-            result = await evaluator.evaluate("hello")
-
-        # Then: the control matches so deny/steer actions can be applied by the engine
-        assert result.matched is True
-        assert result.error is None
-        assert result.metadata is not None
-        assert result.metadata["fallback_action"] == "deny"
+        assert "fallback_action" not in result.metadata

From f3cf8f72609c599833542c75a0b0c408255e789c Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Thu, 14 May 2026 16:41:04 -0700
Subject: [PATCH 11/20] remove project_id from evaluator config

---
 .../luna/client.py                            | 20 +------
 .../luna/config.py                            |  6 --
 .../luna/evaluator.py                         |  2 -
 .../galileo/tests/test_luna_evaluator.py      | 57 ++++++++++---------
 4 files changed, 33 insertions(+), 52 deletions(-)

diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index 86033339..caca997e 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -9,7 +9,6 @@
 from hmac import new as hmac_new
 from json import dumps
 from time import time
-from uuid import UUID
 
 import httpx
 from agent_control_models import JSONObject, JSONValue
@@ -29,7 +28,6 @@ def _b64url(data: bytes) -> str:
 
 def _internal_auth_token(
     api_secret: str,
-    project_id: str | UUID,
     ttl_seconds: int = DEFAULT_INTERNAL_TOKEN_TTL_SECS,
 ) -> str:
     """Create the internal JWT expected by Galileo API internal routes."""
@@ -37,7 +35,6 @@ def _internal_auth_token(
     header = {"alg": "HS256", "typ": "JWT"}
     payload = {
         "internal": True,
-        "project_id": str(project_id),
         "scope": "scorers.invoke",
         "iat": now,
         "exp": now + ttl_seconds,
@@ -90,13 +87,11 @@ class ScorerInvokeRequest(BaseModel):
     Attributes:
         inputs: Selected scorer input values.
         scorer_label: Preset, registered, or fine-tuned scorer label.
-        project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         config: Optional scorer-specific configuration.
     """
 
     scorer_label: str = Field(min_length=1)
     inputs: ScorerInvokeInputs
-    project_id: str | UUID | None = None
     config: JSONObject | None = None
 
     @model_validator(mode="after")
@@ -222,21 +217,13 @@ async def _get_client(self) -> httpx.AsyncClient:
 
     def _endpoint_and_headers(
         self,
-        project_id: str | UUID | None,
         headers: dict[str, str] | None,
     ) -> tuple[str, dict[str, str]]:
         request_headers = dict(headers or {})
         if self.api_secret is None:
             return f"{self.api_base}{PUBLIC_SCORER_INVOKE_PATH}", request_headers
 
-        if project_id is None:
-            raise ValueError(
-                "project_id is required when using GALILEO_API_SECRET_KEY internal auth."
-            )
-
-        request_headers["Authorization"] = (
-            f"Bearer {_internal_auth_token(self.api_secret, project_id)}"
-        )
+        request_headers["Authorization"] = f"Bearer {_internal_auth_token(self.api_secret)}"
         return f"{self.api_base}{INTERNAL_SCORER_INVOKE_PATH}", request_headers
 
     async def invoke(
@@ -245,7 +232,6 @@ async def invoke(
         scorer_label: str,
         input: JSONValue = None,
         output: JSONValue = None,
-        project_id: str | UUID | None = None,
         config: JSONObject | None = None,
         timeout: float = DEFAULT_TIMEOUT_SECS,
         headers: dict[str, str] | None = None,
@@ -256,7 +242,6 @@ async def invoke(
             scorer_label: Preset, registered, or fine-tuned scorer label.
             input: Optional user/system prompt text.
             output: Optional model response text.
-            project_id: Optional Galileo project UUID for project-scoped scorer resolution.
             config: Optional scorer-specific configuration.
             timeout: Request timeout in seconds.
             headers: Additional request headers.
@@ -278,10 +263,9 @@ async def invoke(
             inputs=ScorerInvokeInputs(
                 query="" if input is None else input, response="" if output is None else output
             ),
-            project_id=project_id,
             config=config,
         ).to_dict()
-        endpoint, request_headers = self._endpoint_and_headers(project_id, headers)
+        endpoint, request_headers = self._endpoint_and_headers(headers)
 
         logger.debug("[GalileoLunaClient] POST %s", endpoint)
         logger.debug("[GalileoLunaClient] Request body: %s", request_body)
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index 7bf5de48..0f0d86d5 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 from typing import Literal
-from uuid import UUID
 
 from agent_control_evaluators import EvaluatorConfig
 from agent_control_models import JSONObject, JSONValue
@@ -33,7 +32,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
 
     Attributes:
         scorer_label: Preset, registered, or fine-tuned scorer label.
-        project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         threshold: Local threshold used by the evaluator for comparison.
         operator: Local comparison operator. Numeric operators use threshold as a number.
         scorer_config: Optional scorer-specific config sent as ``config``.
@@ -41,10 +39,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
     """
 
     scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke")
-    project_id: UUID | None = Field(
-        default=None,
-        description="Optional Galileo project UUID for project-scoped scorer resolution.",
-    )
     threshold: JSONValue = Field(
         default=0.5,
         description="Local threshold used to decide whether the control matches.",
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index f9e0ad0d..15798074 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -192,7 +192,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                 scorer_label=self.config.scorer_label,
                 input=input_text if _has_text(input_text) else None,
                 output=output_text if _has_text(output_text) else None,
-                project_id=self.config.project_id,
                 config=self.config.scorer_config,
                 timeout=self.get_timeout_seconds(),
             )
@@ -222,7 +221,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
     def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
         metadata: dict[str, Any] = {
             "scorer_label": response.scorer_label or self.config.scorer_label,
-            "project_id": str(self.config.project_id) if self.config.project_id else None,
             "score": response.score,
             "threshold": self.config.threshold,
             "operator": self.config.operator,
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 1b0bcef8..4e1f45b8 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -28,7 +28,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
         # Given: a direct scorer config with local thresholding
         config = LunaEvaluatorConfig(
             scorer_label="toxicity",
-            project_id="12345678-1234-5678-1234-567812345678",
             threshold=0.7,
             operator="gte",
             config={"temperature": 0},
@@ -36,7 +35,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
 
         # Then: config is retained without Protect concepts
         assert config.scorer_label == "toxicity"
-        assert str(config.project_id) == "12345678-1234-5678-1234-567812345678"
         assert config.threshold == 0.7
         assert config.operator == "gte"
         assert config.scorer_config == {"temperature": 0}
@@ -55,11 +53,10 @@ class TestGalileoLunaClient:
     def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
         from agent_control_evaluator_galileo.luna import ScorerInvokeInputs, ScorerInvokeRequest
 
-        # Given: a scorer request with project context and scorer config
+        # Given: a scorer request with scorer config
         request = ScorerInvokeRequest(
             scorer_label="toxicity",
             inputs=ScorerInvokeInputs(query={"messages": [{"role": "user", "content": "hello"}]}),
-            project_id="12345678-1234-5678-1234-567812345678",
             config={"top_k": 1},
         )
 
@@ -70,7 +67,6 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
                 "query": {"messages": [{"role": "user", "content": "hello"}]},
                 "response": "",
             },
-            "project_id": "12345678-1234-5678-1234-567812345678",
             "config": {"top_k": 1},
         }
 
@@ -185,7 +181,6 @@ def handler(request: httpx.Request) -> httpx.Response:
                 scorer_label="toxicity",
                 input="user prompt",
                 output="model answer",
-                project_id="12345678-1234-5678-1234-567812345678",
                 config={"top_k": 1},
             )
         finally:
@@ -197,7 +192,6 @@ def handler(request: httpx.Request) -> httpx.Response:
         assert captured["body"] == {
             "scorer_label": "toxicity",
             "inputs": {"query": "user prompt", "response": "model answer"},
-            "project_id": "12345678-1234-5678-1234-567812345678",
             "config": {"top_k": 1},
         }
         assert "stage_name" not in captured["body"]
@@ -232,16 +226,12 @@ def handler(request: httpx.Request) -> httpx.Response:
         client._client = httpx.AsyncClient(transport=httpx.MockTransport(handler))
 
         try:
-            # When: invoking a scorer with project context
-            response = await client.invoke(
-                scorer_label="toxicity",
-                output="model answer",
-                project_id="12345678-1234-5678-1234-567812345678",
-            )
+            # When: invoking a scorer with internal JWT auth
+            response = await client.invoke(scorer_label="toxicity", output="model answer")
         finally:
             await client.close()
 
-        # Then: the internal scorer endpoint is called with a project-bound JWT
+        # Then: the internal scorer endpoint is called with an internal JWT
         assert response.score == 0.82
         assert (
             captured["url"] == "https://api.default.svc.cluster.local:8088/internal/scorers/invoke"
@@ -249,7 +239,6 @@ def handler(request: httpx.Request) -> httpx.Response:
         assert captured["body"] == {
             "scorer_label": "toxicity",
             "inputs": {"query": "", "response": "model answer"},
-            "project_id": "12345678-1234-5678-1234-567812345678",
         }
         headers = captured["headers"]
         assert isinstance(headers, dict)
@@ -259,20 +248,41 @@ def handler(request: httpx.Request) -> httpx.Response:
         assert auth_header.startswith("Bearer ")
         token_payload = _decode_jwt_payload(auth_header.removeprefix("Bearer "))
         assert token_payload["internal"] is True
-        assert token_payload["project_id"] == "12345678-1234-5678-1234-567812345678"
         assert token_payload["scope"] == "scorers.invoke"
 
     @pytest.mark.asyncio
-    async def test_client_requires_project_id_for_internal_jwt(self) -> None:
+    async def test_client_uses_internal_jwt_without_api_key(self) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient
 
         # Given: a Luna client configured with internal JWT auth
         with patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True):
             client = GalileoLunaClient(api_url="https://api.default.svc.cluster.local:8088")
 
-        # When/Then: project_id is required because API uses it as the internal auth context
-        with pytest.raises(ValueError, match="project_id is required"):
-            await client.invoke(scorer_label="toxicity", output="model answer")
+        captured: dict[str, object] = {}
+
+        def handler(request: httpx.Request) -> httpx.Response:
+            captured["headers"] = dict(request.headers)
+            return httpx.Response(
+                200,
+                json={"scorer_label": "toxicity", "score": 0.82, "status": "success"},
+            )
+
+        client._client = httpx.AsyncClient(transport=httpx.MockTransport(handler))
+        try:
+            # When: invoking without project context
+            response = await client.invoke(scorer_label="toxicity", output="model answer")
+        finally:
+            await client.close()
+
+        # Then: internal JWT auth still works
+        assert response.score == 0.82
+        headers = captured["headers"]
+        assert isinstance(headers, dict)
+        auth_header = headers["authorization"]
+        assert isinstance(auth_header, str)
+        token_payload = _decode_jwt_payload(auth_header.removeprefix("Bearer "))
+        assert token_payload["internal"] is True
+        assert token_payload["scope"] == "scorers.invoke"
 
     @pytest.mark.asyncio
     @pytest.mark.parametrize("empty_value", ["", " ", {}, []])
@@ -314,12 +324,11 @@ def test_evaluator_init_accepts_api_secret(self) -> None:
         evaluator = LunaEvaluator.from_dict(
             {
                 "scorer_label": "toxicity",
-                "project_id": "12345678-1234-5678-1234-567812345678",
                 "threshold": 0.5,
             }
         )
 
-        assert str(evaluator.config.project_id) == "12345678-1234-5678-1234-567812345678"
+        assert evaluator.config.scorer_label == "toxicity"
 
     @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
     @pytest.mark.asyncio
@@ -331,7 +340,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
         evaluator = LunaEvaluator.from_dict(
             {
                 "scorer_label": "toxicity",
-                "project_id": "12345678-1234-5678-1234-567812345678",
                 "threshold": 0.7,
                 "operator": "gte",
                 "timeout_ms": 5000,
@@ -360,7 +368,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
         assert result.confidence == 0.82
         assert result.metadata == {
             "scorer_label": "toxicity",
-            "project_id": "12345678-1234-5678-1234-567812345678",
             "score": 0.82,
             "threshold": 0.7,
             "operator": "gte",
@@ -372,7 +379,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
             scorer_label="toxicity",
             input="user prompt",
             output="model answer",
-            project_id=evaluator.config.project_id,
             config=None,
             timeout=5.0,
         )
@@ -405,7 +411,6 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
             scorer_label="toxicity",
             input="hello",
             output=None,
-            project_id=None,
             config=None,
             timeout=10.0,
         )

From 025f96f52e5d5d55330acd57a6ee300d8eb90616 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Fri, 15 May 2026 11:28:25 -0700
Subject: [PATCH 12/20] add evaluator context

---
 engine/src/agent_control_engine/core.py       |   9 +-
 engine/tests/test_core.py                     | 192 ++++++++++++++++++
 .../src/agent_control_evaluators/__init__.py  |   8 +-
 .../src/agent_control_evaluators/_base.py     |  62 +++++-
 evaluators/builtin/tests/test_base.py         | 120 ++++++++++-
 .../luna/client.py                            |  31 ++-
 .../luna/config.py                            |  28 ++-
 .../luna/evaluator.py                         |  75 ++++++-
 .../galileo/tests/test_luna_evaluator.py      |  66 ++++++
 models/src/agent_control_models/__init__.py   |   2 +
 models/src/agent_control_models/evaluation.py |  17 ++
 .../integrations/google_adk/plugin.py         |  17 +-
 12 files changed, 599 insertions(+), 28 deletions(-)

diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py
index 99c2273b..0a8f1864 100644
--- a/engine/src/agent_control_engine/core.py
+++ b/engine/src/agent_control_engine/core.py
@@ -18,6 +18,7 @@
     ControlAction,
     ControlMatch,
     ControlScope,
+    EvaluationContext,
     EvaluationRequest,
     EvaluationResponse,
     EvaluatorResult,
@@ -188,8 +189,14 @@ async def _evaluate_leaf(
                 if timeout <= 0:
                     timeout = DEFAULT_EVALUATOR_TIMEOUT
 
+                context = EvaluationContext(
+                    target_type=request.target_type,
+                    target_id=request.target_id,
+                    agent_name=request.agent_name,
+                    step_type=request.step.type,
+                )
                 result = await asyncio.wait_for(
-                    evaluator.evaluate(data),
+                    evaluator.evaluate_with_context(data, context),
                     timeout=timeout,
                 )
         except TimeoutError:
diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py
index 9c8da751..37030998 100644
--- a/engine/tests/test_core.py
+++ b/engine/tests/test_core.py
@@ -2353,3 +2353,195 @@ class MockControl:
         assert selector_errors[0].result.error is not None
         assert "Invalid step_name_regex" in selector_errors[0].result.error
         assert "[invalid(regex" in selector_errors[0].result.message
+
+
+# =============================================================================
+# Test: EvaluationContext Propagation
+# =============================================================================
+
+
+_observed_contexts: list[Any] = []
+
+
+def _reset_observed_contexts() -> None:
+    _observed_contexts.clear()  # emptied in place; the module-level list object is reused
+
+
+class ContextRecordingEvaluator(Evaluator[SimpleConfig]):
+    """Test evaluator that records which entry point was called, and with what.
+
+    Each call appends a ``(tag, data, context)`` tuple to the module-level
+    ``_observed_contexts`` list: ``"with-context"`` for
+    ``evaluate_with_context`` and ``"evaluate-fallback"`` for ``evaluate``.
+    """
+
+    metadata = EvaluatorMetadata(
+        name="test-context-recorder",
+        version="1.0.0",
+        description="Records EvaluationContext",
+    )
+    config_model = SimpleConfig
+
+    async def evaluate(self, data: Any) -> EvaluatorResult:
+        # Tagged separately so a test can tell if evaluate_with_context was bypassed.
+        _observed_contexts.append(("evaluate-fallback", data, None))
+        return EvaluatorResult(matched=False, confidence=1.0, message="fallback")
+
+    async def evaluate_with_context(
+        self,
+        data: Any,
+        context: Any = None,
+    ) -> EvaluatorResult:
+        _observed_contexts.append(("with-context", data, context))
+        return EvaluatorResult(matched=False, confidence=1.0, message="ok")
+
+
+class LegacyOnlyEvaluator(Evaluator[SimpleConfig]):
+    """Test evaluator implementing only ``evaluate(data)``; each call appends
+    ``("legacy-evaluate", data, None)`` to ``_observed_contexts``.
+    """
+
+    metadata = EvaluatorMetadata(
+        name="test-legacy-only",
+        version="1.0.0",
+        description="Legacy signature only",
+    )
+    config_model = SimpleConfig
+
+    async def evaluate(self, data: Any) -> EvaluatorResult:
+        _observed_contexts.append(("legacy-evaluate", data, None))
+        return EvaluatorResult(matched=False, confidence=1.0, message="legacy")
+
+
+class TestEvaluationContextPropagation:
+    """Verify the engine populates and forwards EvaluationContext correctly."""
+
+    @pytest.fixture(autouse=True)
+    def _setup(self):
+        _reset_observed_contexts()
+        # Register the local evaluators; ValueError (presumably a duplicate) is ignored.
+        for cls in (ContextRecordingEvaluator, LegacyOnlyEvaluator):
+            try:
+                register_evaluator(cls)
+            except ValueError:
+                pass
+        yield
+        _reset_observed_contexts()
+
+    @pytest.mark.asyncio
+    async def test_engine_populates_context_from_request(self):
+        """Engine builds an EvaluationContext from the request and passes it
+        to ``evaluate_with_context``.
+        """
+        from agent_control_models import EvaluationContext
+
+        controls = [
+            make_control(1, "ctx1", "test-context-recorder", action="observe"),
+        ]
+        engine = ControlEngine(controls)
+
+        request = EvaluationRequest(
+            agent_name="00000000-0000-0000-0000-000000000001",
+            step=Step(type="llm", name="step-x", input="hello", output=None),
+            stage="pre",
+            target_type="log_stream",
+            target_id="ls-42",
+        )
+        await engine.process(request)
+
+        with_context_observations = [
+            entry for entry in _observed_contexts if entry[0] == "with-context"
+        ]
+        assert len(with_context_observations) == 1, _observed_contexts
+        _, _, context = with_context_observations[0]
+        assert isinstance(context, EvaluationContext)
+        assert context.target_type == "log_stream"
+        assert context.target_id == "ls-42"
+        assert context.agent_name == "00000000-0000-0000-0000-000000000001"
+        assert context.step_type == "llm"
+
+    @pytest.mark.asyncio
+    async def test_engine_passes_context_with_none_target_when_unbound(self):
+        """When the request carries no target binding, target_* on the
+        context are None but the context object is still supplied.
+        """
+        from agent_control_models import EvaluationContext
+
+        controls = [
+            make_control(1, "ctx1", "test-context-recorder", action="observe"),
+        ]
+        engine = ControlEngine(controls)
+
+        request = EvaluationRequest(
+            agent_name="00000000-0000-0000-0000-000000000001",
+            step=Step(type="llm", name="step-y", input="x", output=None),
+            stage="pre",
+            # target_type / target_id are deliberately omitted here.
+        )
+        await engine.process(request)
+
+        with_context = [e for e in _observed_contexts if e[0] == "with-context"]
+        assert len(with_context) == 1
+        context = with_context[0][2]
+        assert isinstance(context, EvaluationContext)
+        assert context.target_type is None
+        assert context.target_id is None
+        assert context.step_type == "llm"
+
+    @pytest.mark.asyncio
+    async def test_legacy_evaluator_still_works_via_default_fallback(self):
+        """Subclasses overriding only ``evaluate(data)`` keep working: the
+        base ``evaluate_with_context`` default routes back to them.
+        """
+        controls = [
+            make_control(1, "ctx1", "test-legacy-only", action="observe"),
+        ]
+        engine = ControlEngine(controls)
+
+        request = EvaluationRequest(
+            agent_name="00000000-0000-0000-0000-000000000001",
+            step=Step(type="llm", name="step-z", input="hello", output=None),
+            stage="pre",
+            target_type="log_stream",
+            target_id="ls-99",
+        )
+        await engine.process(request)
+
+        legacy_calls = [e for e in _observed_contexts if e[0] == "legacy-evaluate"]
+        assert len(legacy_calls) == 1, _observed_contexts
+        # The legacy entry point receives data only; no context object is
+        # observed because the default forwarder drops it to call ``evaluate``.
+        _, data, _ = legacy_calls[0]
+        assert data == "hello"
+
+    @pytest.mark.asyncio
+    async def test_concurrent_requests_receive_distinct_contexts(self):
+        """A cached instance must observe per-call context, not a shared one."""
+        from agent_control_models import EvaluationContext
+
+        controls = [
+            make_control(1, "ctx1", "test-context-recorder", action="observe"),
+        ]
+        engine = ControlEngine(controls)
+
+        async def fire(target_id: str) -> None:
+            request = EvaluationRequest(
+                agent_name="00000000-0000-0000-0000-000000000001",
+                step=Step(
+                    type="llm", name=f"step-{target_id}", input="hi", output=None
+                ),
+                stage="pre",
+                target_type="log_stream",
+                target_id=target_id,
+            )
+            await engine.process(request)
+
+        await asyncio.gather(*(fire(f"ls-{i}") for i in range(5)))
+
+        with_context = [e for e in _observed_contexts if e[0] == "with-context"]
+        assert len(with_context) == 5
+        observed_target_ids = sorted(
+            context.target_id for _, _, context in with_context
+            if isinstance(context, EvaluationContext)
+        )
+        assert observed_target_ids == ["ls-0", "ls-1", "ls-2", "ls-3", "ls-4"]
diff --git a/evaluators/builtin/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py
index b1dabd9e..163a20b0 100644
--- a/evaluators/builtin/src/agent_control_evaluators/__init__.py
+++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py
@@ -28,7 +28,12 @@
     __version__ = "0.0.0.dev"
 
 # Core infrastructure - export from _base and _registry
-from agent_control_evaluators._base import Evaluator, EvaluatorConfig, EvaluatorMetadata
+from agent_control_evaluators._base import (
+    EvaluationContext,
+    Evaluator,
+    EvaluatorConfig,
+    EvaluatorMetadata,
+)
 from agent_control_evaluators._discovery import (
     discover_evaluators,
     ensure_evaluators_discovered,
@@ -51,6 +56,7 @@
 
 __all__ = [
     # Core infrastructure
+    "EvaluationContext",
     "Evaluator",
     "EvaluatorConfig",
     "EvaluatorMetadata",
diff --git a/evaluators/builtin/src/agent_control_evaluators/_base.py b/evaluators/builtin/src/agent_control_evaluators/_base.py
index f5e6fc77..069b2e52 100644
--- a/evaluators/builtin/src/agent_control_evaluators/_base.py
+++ b/evaluators/builtin/src/agent_control_evaluators/_base.py
@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar
 
-from agent_control_models import EvaluatorResult
+from agent_control_models import EvaluationContext, EvaluatorResult
 from agent_control_models.base import BaseModel
 
 if TYPE_CHECKING:
@@ -91,7 +91,25 @@ def __init__(self, config):
             async def evaluate(self, data):
                 self.call_count += 1  # BAD: race condition, leaks between requests
 
-    Example:
+    Runtime Context:
+        Most evaluators only need the selected ``data`` to decide. Some need
+        request-scoped context (the bound target, agent name, step type, etc.)
+        to call out to an external service or change behavior per call. The
+        contract is:
+
+        - ``evaluate(data)`` is the abstract entry point. Every subclass must
+          implement it. It is called by direct callers (tests, examples) and
+          serves as the canonical "no-context" path.
+        - ``evaluate_with_context(data, context)`` is what the engine calls.
+          Its default implementation delegates to ``evaluate(data)``, so
+          existing context-free evaluators work unchanged.
+
+        Evaluators that need context override ``evaluate_with_context`` and
+        either (a) reimplement ``evaluate`` as a delegate to the new method
+        with ``context=None`` (the Luna pattern, recommended for symmetry) or
+        (b) leave ``evaluate`` as a no-context fallback.
+
+    Example (context-free evaluator):
         ```python
         from agent_control_evaluators import (
             Evaluator,
@@ -120,6 +138,33 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                     message="Evaluation complete"
                 )
         ```
+
+    Example (context-aware evaluator):
+        ```python
+        from agent_control_evaluators import (
+            EvaluationContext,
+            Evaluator,
+            EvaluatorConfig,
+            EvaluatorMetadata,
+            register_evaluator,
+        )
+        from agent_control_models import EvaluatorResult
+
+        @register_evaluator
+        class TargetAwareEvaluator(Evaluator[MyConfig]):
+            metadata = EvaluatorMetadata(name="target-aware", version="1.0.0", description="")
+            config_model = MyConfig
+
+            async def evaluate(self, data: Any) -> EvaluatorResult:
+                return await self.evaluate_with_context(data, context=None)
+
+            async def evaluate_with_context(
+                self, data: Any, context: EvaluationContext | None = None
+            ) -> EvaluatorResult:
+                target_id = context.target_id if context else None
+                # ... use target_id in the decision ...
+                return EvaluatorResult(matched=False, confidence=1.0, message="ok")
+        ```
     """
 
     metadata: ClassVar[EvaluatorMetadata]
@@ -160,6 +205,19 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
         """
         pass
 
+    async def evaluate_with_context(
+        self,
+        data: Any,
+        context: EvaluationContext | None = None,
+    ) -> EvaluatorResult:
+        """Evaluate ``data``, optionally using request-scoped ``context``.
+
+        This base implementation does not use ``context``: it returns the
+        result of ``evaluate(data)``, so evaluators that only implement
+        ``evaluate`` keep working. Override it to consume the context.
+        """
+        return await self.evaluate(data)
+
     def get_timeout_seconds(self) -> float:
         """Get timeout in seconds from config or metadata default."""
         timeout_ms: int = getattr(self.config, "timeout_ms", self.metadata.timeout_ms)
diff --git a/evaluators/builtin/tests/test_base.py b/evaluators/builtin/tests/test_base.py
index 776a8d01..368fc75d 100644
--- a/evaluators/builtin/tests/test_base.py
+++ b/evaluators/builtin/tests/test_base.py
@@ -1,12 +1,21 @@
 """Tests for evaluator base classes.
 
-Architecture: Evaluators take config at __init__, evaluate() only takes data.
+Architecture:
+    - ``evaluate(data)`` is the abstract entry point every subclass implements.
+    - ``evaluate_with_context(data, context)`` is the context-aware entry the
+      engine uses; the default delegates to ``evaluate(data)`` so legacy
+      subclasses keep working without modification.
 """
 
 import pytest
 from typing import Any
 
-from agent_control_evaluators import Evaluator, EvaluatorConfig, EvaluatorMetadata
+from agent_control_evaluators import (
+    EvaluationContext,
+    Evaluator,
+    EvaluatorConfig,
+    EvaluatorMetadata,
+)
 from agent_control_models import EvaluatorResult
 
 
@@ -138,3 +147,110 @@ def test_cannot_instantiate_abstract_class(self):
         """Test that Evaluator cannot be instantiated directly."""
         with pytest.raises(TypeError, match="abstract"):
             Evaluator({})  # type: ignore
+
+
+class TestEvaluateWithContext:
+    """Tests for the context-aware entry point on the base Evaluator."""
+
+    @pytest.mark.asyncio
+    async def test_default_evaluate_with_context_delegates_to_evaluate(self):
+        """A subclass that only implements ``evaluate`` is still reachable
+        through ``evaluate_with_context``.
+        """
+        evaluator = MockEvaluator.from_dict({"should_match": True})
+
+        result = await evaluator.evaluate_with_context("payload")
+
+        # The legacy ``evaluate`` returns matched=True and stores the data
+        # in metadata. If the default fallback worked, those carry through.
+        assert result.matched is True
+        assert result.metadata["data"] == "payload"
+
+    @pytest.mark.asyncio
+    async def test_default_evaluate_with_context_ignores_context(self):
+        """The default forwarder drops the context when it calls ``evaluate``
+        — this is by design so legacy implementations are unaffected.
+        """
+        evaluator = MockEvaluator.from_dict({"should_match": False})
+
+        context = EvaluationContext(
+            target_type="log_stream",
+            target_id="ls-123",
+            agent_name="acme",
+            step_type="llm",
+        )
+
+        # Should not raise, even though MockEvaluator.evaluate has no kwargs
+        # for context. The default forwarder strips it.
+        result = await evaluator.evaluate_with_context("data", context)
+
+        assert result.matched is False
+        assert result.metadata["data"] == "data"
+
+    @pytest.mark.asyncio
+    async def test_subclass_can_override_evaluate_with_context(self):
+        """A subclass override of ``evaluate_with_context`` is preferred over
+        the default fallback when the engine calls it.
+        """
+
+        class ContextAwareConfig(EvaluatorConfig):
+            pass
+
+        class ContextAware(Evaluator[ContextAwareConfig]):
+            metadata = EvaluatorMetadata(
+                name="ctx-aware",
+                version="1.0.0",
+                description="",
+            )
+            config_model = ContextAwareConfig
+
+            async def evaluate(self, data: Any) -> EvaluatorResult:
+                # Canonical "no-context" delegate pattern.
+                return await self.evaluate_with_context(data, context=None)
+
+            async def evaluate_with_context(
+                self,
+                data: Any,
+                context: EvaluationContext | None = None,
+            ) -> EvaluatorResult:
+                target_id = context.target_id if context else "no-target"
+                return EvaluatorResult(
+                    matched=True,
+                    confidence=1.0,
+                    message=f"saw {target_id}",
+                )
+
+        evaluator = ContextAware.from_dict({})
+
+        ctx = EvaluationContext(target_type="log_stream", target_id="ls-7")
+        result = await evaluator.evaluate_with_context("data", ctx)
+        assert result.message == "saw ls-7"
+
+        # The Luna-pattern ``evaluate`` should also work as the no-context path.
+        result_no_ctx = await evaluator.evaluate("data")
+        assert result_no_ctx.message == "saw no-target"
+
+    @pytest.mark.asyncio
+    async def test_evaluation_context_defaults_are_none(self):
+        """EvaluationContext is constructible with no arguments and the four
+        fields checked below default to None. Regression guard against orphan
+        fields that have no populator on the engine side.
+        """
+        ctx = EvaluationContext()
+        assert ctx.target_type is None
+        assert ctx.target_id is None
+        assert ctx.agent_name is None
+        assert ctx.step_type is None
+        # Confirm namespace_key was not silently kept around: reading an
+        # attribute the context does not define must raise AttributeError.
+        with pytest.raises(AttributeError):
+            _ = ctx.namespace_key  # type: ignore[attr-defined]
+
+    def test_evaluation_context_is_importable_from_evaluators_package(self):
+        """EvaluationContext is re-exported from agent_control_evaluators so
+        subclasses can colocate their imports.
+        """
+        from agent_control_evaluators import EvaluationContext as Reexported
+        from agent_control_models import EvaluationContext as Canonical
+
+        assert Reexported is Canonical
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index caca997e..6c2b7d61 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -86,16 +86,26 @@ class ScorerInvokeRequest(BaseModel):
 
     Attributes:
         inputs: Selected scorer input values.
+        logstream_id: Optional Galileo log stream identifier for runtime context.
         scorer_label: Preset, registered, or fine-tuned scorer label.
+        scorer_id: Optional Galileo scorer identifier.
+        scorer_version_id: Optional Galileo scorer version identifier.
         config: Optional scorer-specific configuration.
     """
 
-    scorer_label: str = Field(min_length=1)
     inputs: ScorerInvokeInputs
+    logstream_id: str | None = Field(default=None, min_length=1)
+    scorer_label: str | None = Field(default=None, min_length=1)
+    scorer_id: str | None = Field(default=None, min_length=1)
+    scorer_version_id: str | None = Field(default=None, min_length=1)
     config: JSONObject | None = None
 
     @model_validator(mode="after")
-    def ensure_input_or_output(self) -> ScorerInvokeRequest:
+    def ensure_required_values(self) -> ScorerInvokeRequest:
+        if not (self.scorer_label or self.scorer_id or self.scorer_version_id):
+            raise ValueError(
+                "One of scorer_label, scorer_id, or scorer_version_id must be set."
+            )
         if not (_has_value(self.inputs.query) or _has_value(self.inputs.response)):
             raise ValueError("Either inputs.query or inputs.response must be set.")
         return self
@@ -109,14 +119,14 @@ class ScorerInvokeResponse(BaseModel):
     """Response from Galileo Luna scorer invocation.
 
     Attributes:
-        scorer_label: Echoed scorer label.
+        scorer_label: Echoed scorer label, when returned.
         score: Raw scorer value.
         status: Invocation status.
         execution_time: Execution time in seconds, when returned.
         error_message: Error detail for non-success statuses.
     """
 
-    scorer_label: str
+    scorer_label: str | None = None
     score: JSONValue
     status: str = "unknown"
     execution_time: float | None = None
@@ -229,7 +239,10 @@ def _endpoint_and_headers(
     async def invoke(
         self,
         *,
-        scorer_label: str,
+        scorer_label: str | None = None,
+        scorer_id: str | None = None,
+        scorer_version_id: str | None = None,
+        logstream_id: str | None = None,
         input: JSONValue = None,
         output: JSONValue = None,
         config: JSONObject | None = None,
@@ -240,6 +253,9 @@ async def invoke(
 
         Args:
             scorer_label: Preset, registered, or fine-tuned scorer label.
+            scorer_id: Optional Galileo scorer identifier.
+            scorer_version_id: Optional Galileo scorer version identifier.
+            logstream_id: Optional Galileo log stream identifier for runtime context.
             input: Optional user/system prompt text.
             output: Optional model response text.
             config: Optional scorer-specific configuration.
@@ -255,11 +271,16 @@ async def invoke(
             httpx.HTTPStatusError: If the API returns an error status code.
             httpx.RequestError: If the request fails before a response is received.
         """
+        if not (scorer_label or scorer_id or scorer_version_id):
+            raise ValueError("At least one scorer identifier must be provided.")
         if not (_has_value(input) or _has_value(output)):
             raise ValueError("At least one of input or output must be provided.")
 
         request_body = ScorerInvokeRequest(
             scorer_label=scorer_label,
+            scorer_id=scorer_id,
+            scorer_version_id=scorer_version_id,
+            logstream_id=logstream_id,
             inputs=ScorerInvokeInputs(
                 query="" if input is None else input, response="" if output is None else output
             ),
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index 0f0d86d5..1136ffbe 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -31,14 +31,36 @@ class LunaEvaluatorConfig(EvaluatorConfig):
     """Configuration for direct Luna scorer evaluation.
 
     Attributes:
+        logstream_id: Optional Galileo log stream identifier used as runtime context.
         scorer_label: Preset, registered, or fine-tuned scorer label.
+        scorer_id: Optional Galileo scorer identifier.
+        scorer_version_id: Optional Galileo scorer version identifier.
         threshold: Local threshold used by the evaluator for comparison.
         operator: Local comparison operator. Numeric operators use threshold as a number.
         scorer_config: Optional scorer-specific config sent as ``config``.
         timeout_ms: Request timeout in milliseconds.
     """
 
-    scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke")
+    logstream_id: str | None = Field(
+        default=None,
+        min_length=1,
+        description="Optional Galileo log stream identifier used as scorer runtime context.",
+    )
+    scorer_label: str | None = Field(
+        default=None,
+        min_length=1,
+        description="Luna scorer label to invoke.",
+    )
+    scorer_id: str | None = Field(
+        default=None,
+        min_length=1,
+        description="Optional Galileo scorer identifier to invoke.",
+    )
+    scorer_version_id: str | None = Field(
+        default=None,
+        min_length=1,
+        description="Optional Galileo scorer version identifier to invoke.",
+    )
     threshold: JSONValue = Field(
         default=0.5,
         description="Local threshold used to decide whether the control matches.",
@@ -63,6 +85,10 @@ class LunaEvaluatorConfig(EvaluatorConfig):
     @model_validator(mode="after")
     def validate_threshold(self) -> LunaEvaluatorConfig:
         """Validate threshold compatibility with the configured operator."""
+        if not (self.scorer_label or self.scorer_id or self.scorer_version_id):
+            raise ValueError(
+                "one of scorer_label, scorer_id, or scorer_version_id is required"
+            )
         if self.operator in _NUMERIC_OPERATORS and coerce_number(self.threshold) is None:
             raise ValueError(f"operator '{self.operator}' requires a numeric threshold")
         if self.operator != "any" and self.threshold is None:
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index 15798074..79749a84 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -9,7 +9,7 @@
 from typing import Any
 
 from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator
-from agent_control_models import EvaluatorResult, JSONValue
+from agent_control_models import EvaluationContext, EvaluatorResult, JSONValue
 
 from .client import GalileoLunaClient, ScorerInvokeResponse
 from .config import LunaEvaluatorConfig, coerce_number
@@ -76,6 +76,10 @@ def _confidence_from_score(score: JSONValue) -> float:
     return 1.0
 
 
+def _is_logstream_target(target_type: str | None) -> bool:
+    return (target_type or "").lower().replace("-", "_") in {"logstream", "log_stream"}
+
+
 @register_evaluator
 class LunaEvaluator(Evaluator[LunaEvaluatorConfig]):
     """Galileo Luna evaluator using the direct scorer invocation API."""
@@ -133,7 +137,8 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
                 return input_text, output_text
 
         text = _coerce_payload_text(data)
-        if "output" in self.config.scorer_label:
+        scorer_label = self.config.scorer_label or ""
+        if "output" in scorer_label:
             return None, text
         return text, None
 
@@ -170,26 +175,37 @@ def _score_matches(self, score: JSONValue) -> bool:
         raise ValueError(f"Unsupported Luna operator: {operator}")
 
     async def evaluate(self, data: Any) -> EvaluatorResult:
+        """Evaluate selected data without runtime context."""
+        return await self.evaluate_with_context(data, context=None)
+
+    async def evaluate_with_context(
+        self,
+        data: Any,
+        context: EvaluationContext | None = None,
+    ) -> EvaluatorResult:
         """Evaluate selected data with Galileo Luna direct scorer invocation.
 
         Args:
             data: The data selected from the runtime step.
+            context: Optional runtime context from the engine.
 
         Returns:
             EvaluatorResult with local threshold decision and scorer metadata.
         """
         input_text, output_text = self._prepare_payload(data)
+        logstream_id = self._resolve_logstream_id(context)
         if not (_has_text(input_text) or _has_text(output_text)):
             return EvaluatorResult(
                 matched=False,
                 confidence=1.0,
                 message="No data to score with Luna",
-                metadata={"scorer_label": self.config.scorer_label},
+                metadata=self._base_metadata(logstream_id=logstream_id),
             )
 
         try:
+            scorer_kwargs = self._scorer_kwargs(logstream_id=logstream_id)
             response = await self._get_client().invoke(
-                scorer_label=self.config.scorer_label,
+                **scorer_kwargs,
                 input=input_text if _has_text(input_text) else None,
                 output=output_text if _has_text(output_text) else None,
                 config=self.config.scorer_config,
@@ -201,7 +217,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                 raise RuntimeError(message)
 
             matched = self._score_matches(response.score)
-            metadata = self._metadata(response)
+            metadata = self._metadata(response, logstream_id=logstream_id)
             operator = self.config.operator
             threshold = self.config.threshold
             state = "triggered" if matched else "not triggered"
@@ -216,10 +232,39 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
             )
         except Exception as exc:
             logger.error("Luna evaluation error: %s", exc, exc_info=True)
-            return self._handle_error(exc)
-
-    def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
-        metadata: dict[str, Any] = {
+            return self._handle_error(exc, logstream_id=logstream_id)
+
+    def _resolve_logstream_id(self, context: EvaluationContext | None) -> str | None:
+        if context is not None and _is_logstream_target(context.target_type):
+            return context.target_id or self.config.logstream_id  # no id: use config
+        return self.config.logstream_id
+
+    def _base_metadata(self, *, logstream_id: str | None = None) -> dict[str, Any]:
+        candidates = (  # entries whose value is None are left out of the result
+            ("logstream_id", logstream_id),
+            ("scorer_label", self.config.scorer_label),
+            ("scorer_id", self.config.scorer_id),
+            ("scorer_version_id", self.config.scorer_version_id),
+        )
+        return {name: value for name, value in candidates if value is not None}
+
+    def _scorer_kwargs(self, *, logstream_id: str | None = None) -> dict[str, Any]:
+        candidates = (  # entries whose value is None are left out of the kwargs
+            ("logstream_id", logstream_id),
+            ("scorer_label", self.config.scorer_label),
+            ("scorer_id", self.config.scorer_id),
+            ("scorer_version_id", self.config.scorer_version_id),
+        )
+        return {name: value for name, value in candidates if value is not None}
+
+    def _metadata(
+        self,
+        response: ScorerInvokeResponse,
+        *,
+        logstream_id: str | None = None,
+    ) -> dict[str, Any]:
+        metadata: dict[str, Any] = self._base_metadata(logstream_id=logstream_id)
+        metadata.update({
             "scorer_label": response.scorer_label or self.config.scorer_label,
             "score": response.score,
             "threshold": self.config.threshold,
@@ -227,10 +272,15 @@ def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
             "status": response.status,
             "execution_time_seconds": response.execution_time,
             "error_message": response.error_message,
-        }
+        })
         return metadata
 
-    def _handle_error(self, error: Exception) -> EvaluatorResult:
+    def _handle_error(
+        self,
+        error: Exception,
+        *,
+        logstream_id: str | None = None,
+    ) -> EvaluatorResult:
         error_detail = str(error)
         return EvaluatorResult(
             matched=False,
@@ -240,6 +290,9 @@ def _handle_error(self, error: Exception) -> EvaluatorResult:
                 "error": error_detail,
                 "error_type": type(error).__name__,
                 "scorer_label": self.config.scorer_label,
+                "scorer_id": self.config.scorer_id,
+                "scorer_version_id": self.config.scorer_version_id,
+                "logstream_id": logstream_id,
             },
             error=error_detail,
         )
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 4e1f45b8..201d9f73 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -27,18 +27,38 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
 
         # Given: a direct scorer config with local thresholding
         config = LunaEvaluatorConfig(
+            logstream_id="logstream-123",
             scorer_label="toxicity",
+            scorer_id="scorer-123",
+            scorer_version_id="version-123",
             threshold=0.7,
             operator="gte",
             config={"temperature": 0},
         )
 
         # Then: config is retained without Protect concepts
+        assert config.logstream_id == "logstream-123"
         assert config.scorer_label == "toxicity"
+        assert config.scorer_id == "scorer-123"
+        assert config.scorer_version_id == "version-123"
         assert config.threshold == 0.7
         assert config.operator == "gte"
         assert config.scorer_config == {"temperature": 0}
 
+    def test_config_accepts_scorer_id_without_label(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig
+
+        config = LunaEvaluatorConfig(scorer_id="scorer-123")
+
+        assert config.scorer_id == "scorer-123"
+        assert config.scorer_label is None
+
+    def test_config_requires_a_scorer_identifier(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig
+
+        with pytest.raises(ValidationError, match="one of scorer_label"):
+            LunaEvaluatorConfig(threshold=0.5)
+
     def test_numeric_operator_requires_numeric_threshold(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig
 
@@ -55,14 +75,20 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
 
         # Given: a scorer request with scorer config
         request = ScorerInvokeRequest(
+            logstream_id="logstream-123",
             scorer_label="toxicity",
+            scorer_id="scorer-123",
+            scorer_version_id="version-123",
             inputs=ScorerInvokeInputs(query={"messages": [{"role": "user", "content": "hello"}]}),
             config={"top_k": 1},
         )
 
         # Then: the serialized payload uses the API-owned scorer invoke fields
         assert request.to_dict() == {
+            "logstream_id": "logstream-123",
             "scorer_label": "toxicity",
+            "scorer_id": "scorer-123",
+            "scorer_version_id": "version-123",
             "inputs": {
                 "query": {"messages": [{"role": "user", "content": "hello"}]},
                 "response": "",
@@ -383,6 +409,46 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
             timeout=5.0,
         )
 
+    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
+    @pytest.mark.asyncio
+    async def test_evaluator_passes_logstream_id_from_runtime_context(self) -> None:
+        from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+        from agent_control_models import EvaluationContext
+
+        evaluator = LunaEvaluator.from_dict(
+            {
+                "logstream_id": "config-logstream",
+                "scorer_label": "toxicity",
+                "scorer_id": "scorer-123",
+                "scorer_version_id": "version-123",
+            }
+        )
+
+        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+            mock_invoke.return_value = ScorerInvokeResponse(
+                scorer_label="toxicity",
+                score=0.82,
+                status="success",
+            )
+
+            result = await evaluator.evaluate_with_context(
+                "hello",
+                EvaluationContext(target_type="log_stream", target_id="runtime-logstream"),
+            )
+
+        assert result.metadata["logstream_id"] == "runtime-logstream"
+        mock_invoke.assert_awaited_once_with(
+            logstream_id="runtime-logstream",
+            scorer_label="toxicity",
+            scorer_id="scorer-123",
+            scorer_version_id="version-123",
+            input="hello",
+            output=None,
+            config=None,
+            timeout=10.0,
+        )
+
     @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
     @pytest.mark.asyncio
     async def test_evaluator_returns_non_match_below_threshold(self) -> None:
diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py
index 148cdd7a..867367bd 100644
--- a/models/src/agent_control_models/__init__.py
+++ b/models/src/agent_control_models/__init__.py
@@ -61,6 +61,7 @@
     make_error_type,
 )
 from .evaluation import (
+    EvaluationContext,
     EvaluationRequest,
     EvaluationResponse,
     EvaluationResult,
@@ -132,6 +133,7 @@
     # Policy
     "Policy",
     # Evaluation
+    "EvaluationContext",
     "EvaluationRequest",
     "EvaluationResponse",
     "EvaluationResult",
diff --git a/models/src/agent_control_models/evaluation.py b/models/src/agent_control_models/evaluation.py
index 50b5791b..5c81b0c1 100644
--- a/models/src/agent_control_models/evaluation.py
+++ b/models/src/agent_control_models/evaluation.py
@@ -1,4 +1,5 @@
 """Evaluation-related models."""
+from dataclasses import dataclass
 from typing import Annotated, Literal, Self
 
 from pydantic import Field, StringConstraints, field_validator, model_validator
@@ -13,6 +14,22 @@
 ]
 
 
+@dataclass
+class EvaluationContext:
+    """Runtime context available while evaluating a control.
+
+    This is intentionally small and mutable by normal dataclass semantics so
+    downstream users can subclass it with richer runtime context when needed.
+    Only fields the engine actually populates today are declared here; add new
+    fields only when there is a populator on every supported call path.
+    """
+
+    target_type: str | None = None
+    target_id: str | None = None
+    agent_name: str | None = None
+    step_type: str | None = None
+
+
 class EvaluationRequest(BaseModel):
     """
     Request model for evaluation analysis.
diff --git a/sdks/python/src/agent_control/integrations/google_adk/plugin.py b/sdks/python/src/agent_control/integrations/google_adk/plugin.py
index eb2155c8..28e59698 100644
--- a/sdks/python/src/agent_control/integrations/google_adk/plugin.py
+++ b/sdks/python/src/agent_control/integrations/google_adk/plugin.py
@@ -22,11 +22,18 @@
 from agent_control.validation import ensure_agent_name
 
 try:
-    from google.adk.agents.callback_context import CallbackContext  # type: ignore[import-not-found]
-    from google.adk.models import LlmRequest, LlmResponse  # type: ignore[import-not-found]
-    from google.adk.plugins import BasePlugin  # type: ignore[import-not-found]
-    from google.adk.tools import BaseTool  # type: ignore[import-not-found]
-    from google.adk.tools.tool_context import ToolContext  # type: ignore[import-not-found]
+    from google.adk.agents.callback_context import (  # type: ignore[import-not-found,import-untyped]
+        CallbackContext,
+    )
+    from google.adk.models import (  # type: ignore[import-not-found,import-untyped]
+        LlmRequest,
+        LlmResponse,
+    )
+    from google.adk.plugins import BasePlugin  # type: ignore[import-not-found,import-untyped]
+    from google.adk.tools import BaseTool  # type: ignore[import-not-found,import-untyped]
+    from google.adk.tools.tool_context import (  # type: ignore[import-not-found,import-untyped]
+        ToolContext,
+    )
 except Exception as exc:  # pragma: no cover - optional dependency
     raise RuntimeError(
         "Google ADK integration requires google-adk. "

From 15e7a01c1f9a76430cce2870a8fb72bf80796303 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Fri, 15 May 2026 12:57:42 -0700
Subject: [PATCH 13/20] remove evaluation context

---
 engine/src/agent_control_engine/core.py       |   9 +-
 engine/tests/test_core.py                     | 242 ++++--------------
 .../src/agent_control_evaluators/__init__.py  |   2 -
 .../src/agent_control_evaluators/_base.py     |  61 +----
 evaluators/builtin/tests/test_base.py         | 120 +--------
 .../luna/client.py                            |   5 -
 .../luna/config.py                            |   6 -
 .../luna/evaluator.py                         |  42 +--
 .../galileo/tests/test_luna_evaluator.py      |  44 ----
 models/src/agent_control_models/__init__.py   |   2 -
 models/src/agent_control_models/evaluation.py |  17 --
 11 files changed, 56 insertions(+), 494 deletions(-)

diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py
index 0a8f1864..99c2273b 100644
--- a/engine/src/agent_control_engine/core.py
+++ b/engine/src/agent_control_engine/core.py
@@ -18,7 +18,6 @@
     ControlAction,
     ControlMatch,
     ControlScope,
-    EvaluationContext,
     EvaluationRequest,
     EvaluationResponse,
     EvaluatorResult,
@@ -189,14 +188,8 @@ async def _evaluate_leaf(
                 if timeout <= 0:
                     timeout = DEFAULT_EVALUATOR_TIMEOUT
 
-                context = EvaluationContext(
-                    target_type=request.target_type,
-                    target_id=request.target_id,
-                    agent_name=request.agent_name,
-                    step_type=request.step.type,
-                )
                 result = await asyncio.wait_for(
-                    evaluator.evaluate_with_context(data, context),
+                    evaluator.evaluate(data),
                     timeout=timeout,
                 )
         except TimeoutError:
diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py
index 37030998..78eda0ab 100644
--- a/engine/tests/test_core.py
+++ b/engine/tests/test_core.py
@@ -831,7 +831,13 @@ async def test_confidence_is_full_on_deny_match(self):
         controls = [
             make_control(1, "denier", "test-deny", action="deny", config_value="d"),
         ] + [
-            make_control(i + 2, f"blocker{i}", "test-blocker", action="observe", config_value=str(i))
+            make_control(
+                i + 2,
+                f"blocker{i}",
+                "test-blocker",
+                action="observe",
+                config_value=str(i),
+            )
             for i in range(9)
         ]
         engine = ControlEngine(controls)
@@ -1830,10 +1836,20 @@ async def test_server_context_only_runs_server_controls(self):
         """
         controls = [
             make_control_with_execution(
-                1, "local_ctrl", "test-allow", action="observe", config_value="loc", execution="sdk"
+                1,
+                "local_ctrl",
+                "test-allow",
+                action="observe",
+                config_value="loc",
+                execution="sdk",
             ),
             make_control_with_execution(
-                2, "server_ctrl", "test-allow", action="observe", config_value="srv", execution="server"
+                2,
+                "server_ctrl",
+                "test-allow",
+                action="observe",
+                config_value="srv",
+                execution="server",
             ),
         ]
         engine = ControlEngine(controls, context="server")
@@ -1860,10 +1876,20 @@ async def test_sdk_context_only_runs_sdk_controls(self):
         """
         controls = [
             make_control_with_execution(
-                1, "local_ctrl", "test-allow", action="observe", config_value="loc", execution="sdk"
+                1,
+                "local_ctrl",
+                "test-allow",
+                action="observe",
+                config_value="loc",
+                execution="sdk",
             ),
             make_control_with_execution(
-                2, "server_ctrl", "test-allow", action="observe", config_value="srv", execution="server"
+                2,
+                "server_ctrl",
+                "test-allow",
+                action="observe",
+                config_value="srv",
+                execution="server",
             ),
         ]
         engine = ControlEngine(controls, context="sdk")
@@ -1890,10 +1916,20 @@ async def test_default_context_is_server(self):
         """
         controls = [
             make_control_with_execution(
-                1, "local_ctrl", "test-allow", action="observe", config_value="loc", execution="sdk"
+                1,
+                "local_ctrl",
+                "test-allow",
+                action="observe",
+                config_value="loc",
+                execution="sdk",
             ),
             make_control_with_execution(
-                2, "server_ctrl", "test-allow", action="observe", config_value="srv", execution="server"
+                2,
+                "server_ctrl",
+                "test-allow",
+                action="observe",
+                config_value="srv",
+                execution="server",
             ),
         ]
         engine = ControlEngine(controls)  # No context param
@@ -2353,195 +2389,3 @@ class MockControl:
         assert selector_errors[0].result.error is not None
         assert "Invalid step_name_regex" in selector_errors[0].result.error
         assert "[invalid(regex" in selector_errors[0].result.message
-
-
-# =============================================================================
-# Test: EvaluationContext Propagation
-# =============================================================================
-
-
-_observed_contexts: list[Any] = []
-
-
-def _reset_observed_contexts() -> None:
-    _observed_contexts.clear()
-
-
-class ContextRecordingEvaluator(Evaluator[SimpleConfig]):
-    """Evaluator that overrides evaluate_with_context to record the context.
-
-    Verifies that the engine populates EvaluationContext correctly. Records
-    every (data, context) pair it observes so concurrent invocations on a
-    shared instance can be inspected.
-    """
-
-    metadata = EvaluatorMetadata(
-        name="test-context-recorder",
-        version="1.0.0",
-        description="Records EvaluationContext",
-    )
-    config_model = SimpleConfig
-
-    async def evaluate(self, data: Any) -> EvaluatorResult:
-        # Should not be hit when the engine routes through evaluate_with_context.
-        _observed_contexts.append(("evaluate-fallback", data, None))
-        return EvaluatorResult(matched=False, confidence=1.0, message="fallback")
-
-    async def evaluate_with_context(
-        self,
-        data: Any,
-        context: Any = None,
-    ) -> EvaluatorResult:
-        _observed_contexts.append(("with-context", data, context))
-        return EvaluatorResult(matched=False, confidence=1.0, message="ok")
-
-
-class LegacyOnlyEvaluator(Evaluator[SimpleConfig]):
-    """Evaluator that overrides only evaluate(data), proving the default
-    evaluate_with_context fallback routes back to it.
-    """
-
-    metadata = EvaluatorMetadata(
-        name="test-legacy-only",
-        version="1.0.0",
-        description="Legacy signature only",
-    )
-    config_model = SimpleConfig
-
-    async def evaluate(self, data: Any) -> EvaluatorResult:
-        _observed_contexts.append(("legacy-evaluate", data, None))
-        return EvaluatorResult(matched=False, confidence=1.0, message="legacy")
-
-
-class TestEvaluationContextPropagation:
-    """Verify the engine populates and forwards EvaluationContext correctly."""
-
-    @pytest.fixture(autouse=True)
-    def _setup(self):
-        _reset_observed_contexts()
-        # Register the local fixtures (idempotent).
-        for cls in (ContextRecordingEvaluator, LegacyOnlyEvaluator):
-            try:
-                register_evaluator(cls)
-            except ValueError:
-                pass
-        yield
-        _reset_observed_contexts()
-
-    @pytest.mark.asyncio
-    async def test_engine_populates_context_from_request(self):
-        """Engine builds an EvaluationContext from the request and passes it
-        to ``evaluate_with_context``.
-        """
-        from agent_control_models import EvaluationContext
-
-        controls = [
-            make_control(1, "ctx1", "test-context-recorder", action="observe"),
-        ]
-        engine = ControlEngine(controls)
-
-        request = EvaluationRequest(
-            agent_name="00000000-0000-0000-0000-000000000001",
-            step=Step(type="llm", name="step-x", input="hello", output=None),
-            stage="pre",
-            target_type="log_stream",
-            target_id="ls-42",
-        )
-        await engine.process(request)
-
-        with_context_observations = [
-            entry for entry in _observed_contexts if entry[0] == "with-context"
-        ]
-        assert len(with_context_observations) == 1, _observed_contexts
-        _, _, context = with_context_observations[0]
-        assert isinstance(context, EvaluationContext)
-        assert context.target_type == "log_stream"
-        assert context.target_id == "ls-42"
-        assert context.agent_name == "00000000-0000-0000-0000-000000000001"
-        assert context.step_type == "llm"
-
-    @pytest.mark.asyncio
-    async def test_engine_passes_context_with_none_target_when_unbound(self):
-        """When the request carries no target binding, target_* on the
-        context are None but the context object is still supplied.
-        """
-        from agent_control_models import EvaluationContext
-
-        controls = [
-            make_control(1, "ctx1", "test-context-recorder", action="observe"),
-        ]
-        engine = ControlEngine(controls)
-
-        request = EvaluationRequest(
-            agent_name="00000000-0000-0000-0000-000000000001",
-            step=Step(type="llm", name="step-y", input="x", output=None),
-            stage="pre",
-            # No target_type / target_id.
-        )
-        await engine.process(request)
-
-        with_context = [e for e in _observed_contexts if e[0] == "with-context"]
-        assert len(with_context) == 1
-        context = with_context[0][2]
-        assert isinstance(context, EvaluationContext)
-        assert context.target_type is None
-        assert context.target_id is None
-        assert context.step_type == "llm"
-
-    @pytest.mark.asyncio
-    async def test_legacy_evaluator_still_works_via_default_fallback(self):
-        """Subclasses overriding only ``evaluate(data)`` keep working: the
-        base ``evaluate_with_context`` default routes back to them.
-        """
-        controls = [
-            make_control(1, "ctx1", "test-legacy-only", action="observe"),
-        ]
-        engine = ControlEngine(controls)
-
-        request = EvaluationRequest(
-            agent_name="00000000-0000-0000-0000-000000000001",
-            step=Step(type="llm", name="step-z", input="hello", output=None),
-            stage="pre",
-            target_type="log_stream",
-            target_id="ls-99",
-        )
-        await engine.process(request)
-
-        legacy_calls = [e for e in _observed_contexts if e[0] == "legacy-evaluate"]
-        assert len(legacy_calls) == 1, _observed_contexts
-        # The legacy entry point receives data only; no context object is
-        # observed because the default forwarder drops it to call ``evaluate``.
-        _, data, _ = legacy_calls[0]
-        assert data == "hello"
-
-    @pytest.mark.asyncio
-    async def test_concurrent_requests_receive_distinct_contexts(self):
-        """A cached instance must observe per-call context, not a shared one."""
-        from agent_control_models import EvaluationContext
-
-        controls = [
-            make_control(1, "ctx1", "test-context-recorder", action="observe"),
-        ]
-        engine = ControlEngine(controls)
-
-        async def fire(target_id: str) -> None:
-            request = EvaluationRequest(
-                agent_name="00000000-0000-0000-0000-000000000001",
-                step=Step(
-                    type="llm", name=f"step-{target_id}", input="hi", output=None
-                ),
-                stage="pre",
-                target_type="log_stream",
-                target_id=target_id,
-            )
-            await engine.process(request)
-
-        await asyncio.gather(*(fire(f"ls-{i}") for i in range(5)))
-
-        with_context = [e for e in _observed_contexts if e[0] == "with-context"]
-        assert len(with_context) == 5
-        observed_target_ids = sorted(
-            context.target_id for _, _, context in with_context
-            if isinstance(context, EvaluationContext)
-        )
-        assert observed_target_ids == ["ls-0", "ls-1", "ls-2", "ls-3", "ls-4"]
diff --git a/evaluators/builtin/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py
index 163a20b0..d435d801 100644
--- a/evaluators/builtin/src/agent_control_evaluators/__init__.py
+++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py
@@ -29,7 +29,6 @@
 
 # Core infrastructure - export from _base and _registry
 from agent_control_evaluators._base import (
-    EvaluationContext,
     Evaluator,
     EvaluatorConfig,
     EvaluatorMetadata,
@@ -56,7 +55,6 @@
 
 __all__ = [
     # Core infrastructure
-    "EvaluationContext",
     "Evaluator",
     "EvaluatorConfig",
     "EvaluatorMetadata",
diff --git a/evaluators/builtin/src/agent_control_evaluators/_base.py b/evaluators/builtin/src/agent_control_evaluators/_base.py
index 069b2e52..bf36f8c1 100644
--- a/evaluators/builtin/src/agent_control_evaluators/_base.py
+++ b/evaluators/builtin/src/agent_control_evaluators/_base.py
@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar
 
-from agent_control_models import EvaluationContext, EvaluatorResult
+from agent_control_models import EvaluatorResult
 from agent_control_models.base import BaseModel
 
 if TYPE_CHECKING:
@@ -91,25 +91,7 @@ def __init__(self, config):
             async def evaluate(self, data):
                 self.call_count += 1  # BAD: race condition, leaks between requests
 
-    Runtime Context:
-        Most evaluators only need the selected ``data`` to decide. Some need
-        request-scoped context (the bound target, agent name, step type, etc.)
-        to call out to an external service or change behavior per call. The
-        contract is:
-
-        - ``evaluate(data)`` is the abstract entry point. Every subclass must
-          implement it. It is called by direct callers (tests, examples) and
-          serves as the canonical "no-context" path.
-        - ``evaluate_with_context(data, context)`` is what the engine calls.
-          Its default implementation delegates to ``evaluate(data)``, so
-          existing context-free evaluators work unchanged.
-
-        Evaluators that need context override ``evaluate_with_context`` and
-        either (a) reimplement ``evaluate`` as a delegate to the new method
-        with ``context=None`` (the Luna pattern, recommended for symmetry) or
-        (b) leave ``evaluate`` as a no-context fallback.
-
-    Example (context-free evaluator):
+    Example:
         ```python
         from agent_control_evaluators import (
             Evaluator,
@@ -139,32 +121,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                 )
         ```
 
-    Example (context-aware evaluator):
-        ```python
-        from agent_control_evaluators import (
-            EvaluationContext,
-            Evaluator,
-            EvaluatorConfig,
-            EvaluatorMetadata,
-            register_evaluator,
-        )
-        from agent_control_models import EvaluatorResult
-
-        @register_evaluator
-        class TargetAwareEvaluator(Evaluator[MyConfig]):
-            metadata = EvaluatorMetadata(name="target-aware", version="1.0.0", description="")
-            config_model = MyConfig
-
-            async def evaluate(self, data: Any) -> EvaluatorResult:
-                return await self.evaluate_with_context(data, context=None)
-
-            async def evaluate_with_context(
-                self, data: Any, context: EvaluationContext | None = None
-            ) -> EvaluatorResult:
-                target_id = context.target_id if context else None
-                # ... use target_id in the decision ...
-                return EvaluatorResult(matched=False, confidence=1.0, message="ok")
-        ```
     """
 
     metadata: ClassVar[EvaluatorMetadata]
@@ -205,19 +161,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
         """
         pass
 
-    async def evaluate_with_context(
-        self,
-        data: Any,
-        context: EvaluationContext | None = None,
-    ) -> EvaluatorResult:
-        """Evaluate data with optional runtime context.
-
-        Evaluators that need request-scoped metadata may override this method.
-        The default keeps existing evaluators source-compatible by delegating to
-        the original ``evaluate(data)`` contract.
-        """
-        return await self.evaluate(data)
-
     def get_timeout_seconds(self) -> float:
         """Get timeout in seconds from config or metadata default."""
         timeout_ms: int = getattr(self.config, "timeout_ms", self.metadata.timeout_ms)
diff --git a/evaluators/builtin/tests/test_base.py b/evaluators/builtin/tests/test_base.py
index 368fc75d..776a8d01 100644
--- a/evaluators/builtin/tests/test_base.py
+++ b/evaluators/builtin/tests/test_base.py
@@ -1,21 +1,12 @@
 """Tests for evaluator base classes.
 
-Architecture:
-    - ``evaluate(data)`` is the abstract entry point every subclass implements.
-    - ``evaluate_with_context(data, context)`` is the context-aware entry the
-      engine uses; the default delegates to ``evaluate(data)`` so legacy
-      subclasses keep working without modification.
+Architecture: Evaluators take config at __init__, evaluate() only takes data.
 """
 
 import pytest
 from typing import Any
 
-from agent_control_evaluators import (
-    EvaluationContext,
-    Evaluator,
-    EvaluatorConfig,
-    EvaluatorMetadata,
-)
+from agent_control_evaluators import Evaluator, EvaluatorConfig, EvaluatorMetadata
 from agent_control_models import EvaluatorResult
 
 
@@ -147,110 +138,3 @@ def test_cannot_instantiate_abstract_class(self):
         """Test that Evaluator cannot be instantiated directly."""
         with pytest.raises(TypeError, match="abstract"):
             Evaluator({})  # type: ignore
-
-
-class TestEvaluateWithContext:
-    """Tests for the context-aware entry point on the base Evaluator."""
-
-    @pytest.mark.asyncio
-    async def test_default_evaluate_with_context_delegates_to_evaluate(self):
-        """A subclass that only implements ``evaluate`` is still reachable
-        through ``evaluate_with_context``.
-        """
-        evaluator = MockEvaluator.from_dict({"should_match": True})
-
-        result = await evaluator.evaluate_with_context("payload")
-
-        # The legacy ``evaluate`` returns matched=True and stores the data
-        # in metadata. If the default fallback worked, those carry through.
-        assert result.matched is True
-        assert result.metadata["data"] == "payload"
-
-    @pytest.mark.asyncio
-    async def test_default_evaluate_with_context_ignores_context(self):
-        """The default forwarder drops the context when it calls ``evaluate``
-        — this is by design so legacy implementations are unaffected.
-        """
-        evaluator = MockEvaluator.from_dict({"should_match": False})
-
-        context = EvaluationContext(
-            target_type="log_stream",
-            target_id="ls-123",
-            agent_name="acme",
-            step_type="llm",
-        )
-
-        # Should not raise, even though MockEvaluator.evaluate has no kwargs
-        # for context. The default forwarder strips it.
-        result = await evaluator.evaluate_with_context("data", context)
-
-        assert result.matched is False
-        assert result.metadata["data"] == "data"
-
-    @pytest.mark.asyncio
-    async def test_subclass_can_override_evaluate_with_context(self):
-        """A subclass override of ``evaluate_with_context`` is preferred over
-        the default fallback when the engine calls it.
-        """
-
-        class ContextAwareConfig(EvaluatorConfig):
-            pass
-
-        class ContextAware(Evaluator[ContextAwareConfig]):
-            metadata = EvaluatorMetadata(
-                name="ctx-aware",
-                version="1.0.0",
-                description="",
-            )
-            config_model = ContextAwareConfig
-
-            async def evaluate(self, data: Any) -> EvaluatorResult:
-                # Canonical "no-context" delegate pattern.
-                return await self.evaluate_with_context(data, context=None)
-
-            async def evaluate_with_context(
-                self,
-                data: Any,
-                context: EvaluationContext | None = None,
-            ) -> EvaluatorResult:
-                target_id = context.target_id if context else "no-target"
-                return EvaluatorResult(
-                    matched=True,
-                    confidence=1.0,
-                    message=f"saw {target_id}",
-                )
-
-        evaluator = ContextAware.from_dict({})
-
-        ctx = EvaluationContext(target_type="log_stream", target_id="ls-7")
-        result = await evaluator.evaluate_with_context("data", ctx)
-        assert result.message == "saw ls-7"
-
-        # The Luna-pattern ``evaluate`` should also work as the no-context path.
-        result_no_ctx = await evaluator.evaluate("data")
-        assert result_no_ctx.message == "saw no-target"
-
-    @pytest.mark.asyncio
-    async def test_evaluation_context_defaults_are_none(self):
-        """All EvaluationContext fields default to None and the dataclass is
-        constructible with no arguments. Regression guard against orphan
-        fields that have no populator on the engine side.
-        """
-        ctx = EvaluationContext()
-        assert ctx.target_type is None
-        assert ctx.target_id is None
-        assert ctx.agent_name is None
-        assert ctx.step_type is None
-        # Confirm we did not silently keep namespace_key around; reading an
-        # unknown attribute should fail.
-        with pytest.raises(AttributeError):
-            _ = ctx.namespace_key  # type: ignore[attr-defined]
-
-    def test_evaluation_context_is_importable_from_evaluators_package(self):
-        """EvaluationContext is re-exported from agent_control_evaluators so
-        subclasses can colocate their imports.
-        """
-        from agent_control_evaluators import EvaluationContext as Reexported
-        from agent_control_models import EvaluationContext as Canonical
-
-        assert Reexported is Canonical
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index 6c2b7d61..51d34c96 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -86,7 +86,6 @@ class ScorerInvokeRequest(BaseModel):
 
     Attributes:
         inputs: Selected scorer input values.
-        logstream_id: Optional Galileo log stream identifier for runtime context.
         scorer_label: Preset, registered, or fine-tuned scorer label.
         scorer_id: Optional Galileo scorer identifier.
         scorer_version_id: Optional Galileo scorer version identifier.
@@ -94,7 +93,6 @@ class ScorerInvokeRequest(BaseModel):
     """
 
     inputs: ScorerInvokeInputs
-    logstream_id: str | None = Field(default=None, min_length=1)
     scorer_label: str | None = Field(default=None, min_length=1)
     scorer_id: str | None = Field(default=None, min_length=1)
     scorer_version_id: str | None = Field(default=None, min_length=1)
@@ -242,7 +240,6 @@ async def invoke(
         scorer_label: str | None = None,
         scorer_id: str | None = None,
         scorer_version_id: str | None = None,
-        logstream_id: str | None = None,
         input: JSONValue = None,
         output: JSONValue = None,
         config: JSONObject | None = None,
@@ -255,7 +252,6 @@ async def invoke(
             scorer_label: Preset, registered, or fine-tuned scorer label.
             scorer_id: Optional Galileo scorer identifier.
             scorer_version_id: Optional Galileo scorer version identifier.
-            logstream_id: Optional Galileo log stream identifier for runtime context.
             input: Optional user/system prompt text.
             output: Optional model response text.
             config: Optional scorer-specific configuration.
@@ -280,7 +276,6 @@ async def invoke(
             scorer_label=scorer_label,
             scorer_id=scorer_id,
             scorer_version_id=scorer_version_id,
-            logstream_id=logstream_id,
             inputs=ScorerInvokeInputs(
                 query="" if input is None else input, response="" if output is None else output
             ),
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index 1136ffbe..c49dd716 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -31,7 +31,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
     """Configuration for direct Luna scorer evaluation.
 
     Attributes:
-        logstream_id: Optional Galileo log stream identifier used as runtime context.
         scorer_label: Preset, registered, or fine-tuned scorer label.
         scorer_id: Optional Galileo scorer identifier.
         scorer_version_id: Optional Galileo scorer version identifier.
@@ -41,11 +40,6 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         timeout_ms: Request timeout in milliseconds.
     """
 
-    logstream_id: str | None = Field(
-        default=None,
-        min_length=1,
-        description="Optional Galileo log stream identifier used as scorer runtime context.",
-    )
     scorer_label: str | None = Field(
         default=None,
         min_length=1,
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index 79749a84..ce46cf44 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -9,7 +9,7 @@
 from typing import Any
 
 from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator
-from agent_control_models import EvaluationContext, EvaluatorResult, JSONValue
+from agent_control_models import EvaluatorResult, JSONValue
 
 from .client import GalileoLunaClient, ScorerInvokeResponse
 from .config import LunaEvaluatorConfig, coerce_number
@@ -76,10 +76,6 @@ def _confidence_from_score(score: JSONValue) -> float:
     return 1.0
 
 
-def _is_logstream_target(target_type: str | None) -> bool:
-    return (target_type or "").lower() in {"logstream", "log_stream", "log-stream"}
-
-
 @register_evaluator
 class LunaEvaluator(Evaluator[LunaEvaluatorConfig]):
     """Galileo Luna evaluator using the direct scorer invocation API."""
@@ -175,35 +171,25 @@ def _score_matches(self, score: JSONValue) -> bool:
         raise ValueError(f"Unsupported Luna operator: {operator}")
 
     async def evaluate(self, data: Any) -> EvaluatorResult:
-        """Evaluate selected data without runtime context."""
-        return await self.evaluate_with_context(data, context=None)
-
-    async def evaluate_with_context(
-        self,
-        data: Any,
-        context: EvaluationContext | None = None,
-    ) -> EvaluatorResult:
         """Evaluate selected data with Galileo Luna direct scorer invocation.
 
         Args:
             data: The data selected from the runtime step.
-            context: Optional runtime context from the engine.
 
         Returns:
             EvaluatorResult with local threshold decision and scorer metadata.
         """
         input_text, output_text = self._prepare_payload(data)
-        logstream_id = self._resolve_logstream_id(context)
         if not (_has_text(input_text) or _has_text(output_text)):
             return EvaluatorResult(
                 matched=False,
                 confidence=1.0,
                 message="No data to score with Luna",
-                metadata=self._base_metadata(logstream_id=logstream_id),
+                metadata=self._base_metadata(),
             )
 
         try:
-            scorer_kwargs = self._scorer_kwargs(logstream_id=logstream_id)
+            scorer_kwargs = self._scorer_kwargs()
             response = await self._get_client().invoke(
                 **scorer_kwargs,
                 input=input_text if _has_text(input_text) else None,
@@ -217,7 +203,7 @@ async def evaluate_with_context(
                 raise RuntimeError(message)
 
             matched = self._score_matches(response.score)
-            metadata = self._metadata(response, logstream_id=logstream_id)
+            metadata = self._metadata(response)
             operator = self.config.operator
             threshold = self.config.threshold
             state = "triggered" if matched else "not triggered"
@@ -232,25 +218,18 @@ async def evaluate_with_context(
             )
         except Exception as exc:
             logger.error("Luna evaluation error: %s", exc, exc_info=True)
-            return self._handle_error(exc, logstream_id=logstream_id)
-
-    def _resolve_logstream_id(self, context: EvaluationContext | None) -> str | None:
-        if context is not None and _is_logstream_target(context.target_type):
-            return context.target_id
-        return self.config.logstream_id
+            return self._handle_error(exc)
 
-    def _base_metadata(self, *, logstream_id: str | None = None) -> dict[str, Any]:
+    def _base_metadata(self) -> dict[str, Any]:
         metadata = {
-            "logstream_id": logstream_id,
             "scorer_label": self.config.scorer_label,
             "scorer_id": self.config.scorer_id,
             "scorer_version_id": self.config.scorer_version_id,
         }
         return {key: value for key, value in metadata.items() if value is not None}
 
-    def _scorer_kwargs(self, *, logstream_id: str | None = None) -> dict[str, Any]:
+    def _scorer_kwargs(self) -> dict[str, Any]:
         kwargs = {
-            "logstream_id": logstream_id,
             "scorer_label": self.config.scorer_label,
             "scorer_id": self.config.scorer_id,
             "scorer_version_id": self.config.scorer_version_id,
@@ -260,10 +239,8 @@ def _scorer_kwargs(self, *, logstream_id: str | None = None) -> dict[str, Any]:
     def _metadata(
         self,
         response: ScorerInvokeResponse,
-        *,
-        logstream_id: str | None = None,
     ) -> dict[str, Any]:
-        metadata: dict[str, Any] = self._base_metadata(logstream_id=logstream_id)
+        metadata: dict[str, Any] = self._base_metadata()
         metadata.update({
             "scorer_label": response.scorer_label or self.config.scorer_label,
             "score": response.score,
@@ -278,8 +255,6 @@ def _metadata(
     def _handle_error(
         self,
         error: Exception,
-        *,
-        logstream_id: str | None = None,
     ) -> EvaluatorResult:
         error_detail = str(error)
         return EvaluatorResult(
@@ -292,7 +267,6 @@ def _handle_error(
                 "scorer_label": self.config.scorer_label,
                 "scorer_id": self.config.scorer_id,
                 "scorer_version_id": self.config.scorer_version_id,
-                "logstream_id": logstream_id,
             },
             error=error_detail,
         )
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 201d9f73..1b7a6e94 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -27,7 +27,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
 
         # Given: a direct scorer config with local thresholding
         config = LunaEvaluatorConfig(
-            logstream_id="logstream-123",
             scorer_label="toxicity",
             scorer_id="scorer-123",
             scorer_version_id="version-123",
@@ -37,7 +36,6 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
         )
 
         # Then: config is retained without Protect concepts
-        assert config.logstream_id == "logstream-123"
         assert config.scorer_label == "toxicity"
         assert config.scorer_id == "scorer-123"
         assert config.scorer_version_id == "version-123"
@@ -75,7 +73,6 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
 
         # Given: a scorer request with scorer config
         request = ScorerInvokeRequest(
-            logstream_id="logstream-123",
             scorer_label="toxicity",
             scorer_id="scorer-123",
             scorer_version_id="version-123",
@@ -85,7 +82,6 @@ def test_scorer_invoke_request_matches_api_schema_shape(self) -> None:
 
         # Then: the serialized payload uses the API-owned scorer invoke fields
         assert request.to_dict() == {
-            "logstream_id": "logstream-123",
             "scorer_label": "toxicity",
             "scorer_id": "scorer-123",
             "scorer_version_id": "version-123",
@@ -409,46 +405,6 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
             timeout=5.0,
         )
 
-    @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
-    @pytest.mark.asyncio
-    async def test_evaluator_passes_logstream_id_from_runtime_context(self) -> None:
-        from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse
-        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
-        from agent_control_models import EvaluationContext
-
-        evaluator = LunaEvaluator.from_dict(
-            {
-                "logstream_id": "config-logstream",
-                "scorer_label": "toxicity",
-                "scorer_id": "scorer-123",
-                "scorer_version_id": "version-123",
-            }
-        )
-
-        with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
-            mock_invoke.return_value = ScorerInvokeResponse(
-                scorer_label="toxicity",
-                score=0.82,
-                status="success",
-            )
-
-            result = await evaluator.evaluate_with_context(
-                "hello",
-                EvaluationContext(target_type="log_stream", target_id="runtime-logstream"),
-            )
-
-        assert result.metadata["logstream_id"] == "runtime-logstream"
-        mock_invoke.assert_awaited_once_with(
-            logstream_id="runtime-logstream",
-            scorer_label="toxicity",
-            scorer_id="scorer-123",
-            scorer_version_id="version-123",
-            input="hello",
-            output=None,
-            config=None,
-            timeout=10.0,
-        )
-
     @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"})
     @pytest.mark.asyncio
     async def test_evaluator_returns_non_match_below_threshold(self) -> None:
diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py
index 867367bd..148cdd7a 100644
--- a/models/src/agent_control_models/__init__.py
+++ b/models/src/agent_control_models/__init__.py
@@ -61,7 +61,6 @@
     make_error_type,
 )
 from .evaluation import (
-    EvaluationContext,
     EvaluationRequest,
     EvaluationResponse,
     EvaluationResult,
@@ -133,7 +132,6 @@
     # Policy
     "Policy",
     # Evaluation
-    "EvaluationContext",
     "EvaluationRequest",
     "EvaluationResponse",
     "EvaluationResult",
diff --git a/models/src/agent_control_models/evaluation.py b/models/src/agent_control_models/evaluation.py
index 5c81b0c1..50b5791b 100644
--- a/models/src/agent_control_models/evaluation.py
+++ b/models/src/agent_control_models/evaluation.py
@@ -1,5 +1,4 @@
 """Evaluation-related models."""
-from dataclasses import dataclass
 from typing import Annotated, Literal, Self
 
 from pydantic import Field, StringConstraints, field_validator, model_validator
@@ -14,22 +13,6 @@
 ]
 
 
-@dataclass
-class EvaluationContext:
-    """Runtime context available while evaluating a control.
-
-    This is intentionally small and mutable by normal dataclass semantics so
-    downstream users can subclass it with richer runtime context when needed.
-    Only fields the engine actually populates today are declared here; add new
-    fields only when there is a populator on every supported call path.
-    """
-
-    target_type: str | None = None
-    target_id: str | None = None
-    agent_name: str | None = None
-    step_type: str | None = None
-
-
 class EvaluationRequest(BaseModel):
     """
     Request model for evaluation analysis.

From a06d3f10edbd66fcd6d877d94b02545fb11cd241 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Fri, 15 May 2026 14:59:33 -0700
Subject: [PATCH 14/20] add tests for coverage

---
 .../builtin/tests/list/test_list_extra.py     |  63 ++++++
 evaluators/builtin/tests/regex/__init__.py    |   0
 evaluators/builtin/tests/regex/test_regex.py  | 115 +++++++++++
 .../tests/sql/test_sql_config_validation.py   | 103 ++++++++++
 evaluators/builtin/tests/test_discovery.py    | 187 ++++++++++++++++++
 evaluators/builtin/tests/test_factory.py      | 172 ++++++++++++++++
 evaluators/builtin/tests/test_registry.py     | 119 +++++++++++
 7 files changed, 759 insertions(+)
 create mode 100644 evaluators/builtin/tests/list/test_list_extra.py
 create mode 100644 evaluators/builtin/tests/regex/__init__.py
 create mode 100644 evaluators/builtin/tests/regex/test_regex.py
 create mode 100644 evaluators/builtin/tests/sql/test_sql_config_validation.py
 create mode 100644 evaluators/builtin/tests/test_discovery.py
 create mode 100644 evaluators/builtin/tests/test_factory.py
 create mode 100644 evaluators/builtin/tests/test_registry.py

diff --git a/evaluators/builtin/tests/list/test_list_extra.py b/evaluators/builtin/tests/list/test_list_extra.py
new file mode 100644
index 00000000..ff8fe90a
--- /dev/null
+++ b/evaluators/builtin/tests/list/test_list_extra.py
@@ -0,0 +1,63 @@
+"""Targeted tests covering match_mode branches and edge-case messages."""
+
+from __future__ import annotations
+
+import pytest
+from agent_control_evaluators.list.config import ListEvaluatorConfig
+from agent_control_evaluators.list.evaluator import ListEvaluator
+
+
+@pytest.mark.asyncio
+async def test_match_mode_contains_uses_word_boundary():
+    """contains mode matches whole words but rejects sub-word matches."""
+    config = ListEvaluatorConfig(values=["admin"], match_mode="contains")
+    evaluator = ListEvaluator(config)
+
+    matched = await evaluator.evaluate("the admin user logged in")
+    assert matched.matched is True
+
+    not_matched = await evaluator.evaluate("administrator")  # sub-word, no boundary
+    assert not_matched.matched is False
+
+
+@pytest.mark.asyncio
+async def test_match_mode_exact_is_the_default():
+    """No explicit mode uses anchored exact matching."""
+    config = ListEvaluatorConfig(values=["admin"])
+    evaluator = ListEvaluator(config)
+
+    exact = await evaluator.evaluate("admin")
+    assert exact.matched is True
+
+    partial = await evaluator.evaluate("admin user")  # trailing text fails the end anchor
+    assert partial.matched is False
+
+
+@pytest.mark.asyncio
+async def test_data_none_returns_empty_input_message():
+    """None input is treated as empty and the control is ignored."""
+    config = ListEvaluatorConfig(values=["x"])
+    evaluator = ListEvaluator(config)
+
+    result = await evaluator.evaluate(None)
+
+    assert result.matched is False
+    assert result.message == "Empty input - control ignored"
+    assert result.metadata["input_count"] == 0
+
+
+@pytest.mark.asyncio
+async def test_message_truncates_match_list_at_five():
+    """More than five matches collapse into a ``(+N more)`` suffix."""
+    config = ListEvaluatorConfig(
+        values=["a", "b", "c", "d", "e", "f", "g"],
+        logic="any",
+    )
+    evaluator = ListEvaluator(config)
+
+    result = await evaluator.evaluate(["a", "b", "c", "d", "e", "f", "g"])
+
+    assert result.matched is True
+    # First five matches appear, the rest summarized.
+    assert "a, b, c, d, e" in result.message
+    assert "(+2 more)" in result.message
diff --git a/evaluators/builtin/tests/regex/__init__.py b/evaluators/builtin/tests/regex/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/evaluators/builtin/tests/regex/test_regex.py b/evaluators/builtin/tests/regex/test_regex.py
new file mode 100644
index 00000000..9df69560
--- /dev/null
+++ b/evaluators/builtin/tests/regex/test_regex.py
@@ -0,0 +1,115 @@
+"""Tests for the regex evaluator and its config validation."""
+
+from __future__ import annotations
+
+import pytest
+from agent_control_evaluators.regex.config import RegexEvaluatorConfig
+from agent_control_evaluators.regex.evaluator import RegexEvaluator
+
+
+class TestRegexConfig:
+    """Pattern validation rejects invalid RE2 syntax at config time."""
+
+    def test_valid_pattern_accepted(self):
+        config = RegexEvaluatorConfig(pattern=r"\d{3}-\d{2}-\d{4}")
+        assert config.pattern == r"\d{3}-\d{2}-\d{4}"
+
+    def test_empty_pattern_accepted(self):
+        # Empty string is technically a valid RE2 pattern (matches everything).
+        config = RegexEvaluatorConfig(pattern="")
+        assert config.pattern == ""
+
+    def test_invalid_pattern_rejected(self):
+        with pytest.raises(ValueError, match="Invalid regex pattern"):
+            RegexEvaluatorConfig(pattern="[invalid(regex")
+
+    def test_flags_default_to_none(self):
+        config = RegexEvaluatorConfig(pattern=r"\d+")
+        assert config.flags is None
+
+    def test_flags_can_be_specified(self):
+        config = RegexEvaluatorConfig(pattern="secret", flags=["IGNORECASE"])
+        assert config.flags == ["IGNORECASE"]
+
+
+class TestRegexEvaluator:
+    """Pattern matching against arbitrary data."""
+
+    @pytest.mark.asyncio
+    async def test_match_returns_matched_true(self):
+        evaluator = RegexEvaluator.from_dict({"pattern": r"\d{3}-\d{4}"})
+
+        result = await evaluator.evaluate("call 555-1234 today")
+
+        assert result.matched is True
+        assert result.confidence == 1.0
+        assert "found" in result.message
+        assert result.metadata["pattern"] == r"\d{3}-\d{4}"
+
+    @pytest.mark.asyncio
+    async def test_no_match_returns_matched_false(self):
+        evaluator = RegexEvaluator.from_dict({"pattern": r"\d{3}-\d{4}"})
+
+        result = await evaluator.evaluate("no numbers here")
+
+        assert result.matched is False
+        assert "not found" in result.message
+
+    @pytest.mark.asyncio
+    async def test_none_data_returns_no_data_message(self):
+        evaluator = RegexEvaluator.from_dict({"pattern": r".*"})
+
+        result = await evaluator.evaluate(None)
+
+        assert result.matched is False
+        assert result.message == "No data to match"
+
+    @pytest.mark.asyncio
+    async def test_non_string_data_is_coerced(self):
+        """Non-string inputs are stringified before matching."""
+        evaluator = RegexEvaluator.from_dict({"pattern": r"^42$"})
+
+        result = await evaluator.evaluate(42)
+
+        assert result.matched is True
+
+    @pytest.mark.asyncio
+    async def test_ignorecase_flag_short_form(self):
+        """The ``I`` short form is treated the same as ``IGNORECASE``."""
+        evaluator = RegexEvaluator.from_dict(
+            {"pattern": "SECRET", "flags": ["I"]},
+        )
+
+        result = await evaluator.evaluate("the secret value")
+
+        assert result.matched is True
+
+    @pytest.mark.asyncio
+    async def test_ignorecase_flag_long_form(self):
+        evaluator = RegexEvaluator.from_dict(
+            {"pattern": "secret", "flags": ["IGNORECASE"]},
+        )
+
+        result = await evaluator.evaluate("THE SECRET VALUE")
+
+        assert result.matched is True
+
+    @pytest.mark.asyncio
+    async def test_unknown_flag_is_ignored(self):
+        """RE2 supports a narrow flag set; unknown flag names must not raise."""
+        evaluator = RegexEvaluator.from_dict(
+            {"pattern": "x", "flags": ["MULTILINE"]},
+        )
+
+        result = await evaluator.evaluate("xyz")
+
+        # Should still work — unknown flag is silently dropped, not an error.
+        assert result.matched is True
+
+    @pytest.mark.asyncio
+    async def test_case_sensitive_by_default(self):
+        evaluator = RegexEvaluator.from_dict({"pattern": "Secret"})
+
+        result = await evaluator.evaluate("the secret value")
+
+        assert result.matched is False
diff --git a/evaluators/builtin/tests/sql/test_sql_config_validation.py b/evaluators/builtin/tests/sql/test_sql_config_validation.py
new file mode 100644
index 00000000..8842ed4f
--- /dev/null
+++ b/evaluators/builtin/tests/sql/test_sql_config_validation.py
@@ -0,0 +1,103 @@
+"""Targeted tests for SQLEvaluatorConfig validate_config branches."""
+
+from __future__ import annotations
+
+import warnings
+
+import pytest
+from agent_control_evaluators.sql.config import SQLEvaluatorConfig
+
+
+class TestConflictingRestrictions:
+    """Mutually-exclusive allow/block lists must be rejected at config time."""
+
+    def test_blocked_and_allowed_operations_conflict(self):
+        with pytest.raises(ValueError, match="blocked_operations and allowed_operations"):
+            SQLEvaluatorConfig(
+                blocked_operations=["DELETE"],
+                allowed_operations=["SELECT"],
+            )
+
+    def test_blocked_and_allowed_tables_conflict(self):
+        with pytest.raises(ValueError, match="allowed_tables and blocked_tables"):
+            SQLEvaluatorConfig(
+                allowed_tables=["users"],
+                blocked_tables=["secrets"],
+            )
+
+    def test_blocked_and_allowed_schemas_conflict(self):
+        with pytest.raises(ValueError, match="allowed_schemas and blocked_schemas"):
+            SQLEvaluatorConfig(
+                allowed_schemas=["public"],
+                blocked_schemas=["internal"],
+            )
+
+
+class TestLimitBounds:
+    """Numeric controls must be positive."""
+
+    def test_max_limit_must_be_positive(self):
+        with pytest.raises(ValueError, match="max_limit must be a positive integer"):
+            SQLEvaluatorConfig(max_limit=0)
+
+    def test_max_limit_negative_rejected(self):
+        with pytest.raises(ValueError, match="max_limit must be a positive integer"):
+            SQLEvaluatorConfig(max_limit=-5)
+
+    def test_max_statements_must_be_positive(self):
+        with pytest.raises(ValueError, match="max_statements must be a positive integer"):
+            SQLEvaluatorConfig(
+                allow_multi_statements=True,
+                max_statements=0,
+            )
+
+
+class TestColumnControls:
+    """Column-level validators cover required_column_values shape rules."""
+
+    def test_column_context_without_required_columns_warns(self):
+        with pytest.warns(UserWarning, match="column_context is set but required_columns"):
+            SQLEvaluatorConfig(column_context="where")
+
+    def test_required_column_values_rejects_empty_column_ref(self):
+        with pytest.raises(ValueError, match="empty column reference"):
+            SQLEvaluatorConfig(
+                required_columns=["tenant_id"],
+                required_column_values={"   ": "tenant_id"},
+            )
+
+    def test_required_column_values_rejects_malformed_qualified_ref(self):
+        with pytest.raises(
+            ValueError, match="'table.column' format when qualified"
+        ):
+            SQLEvaluatorConfig(
+                required_columns=["tenant_id"],
+                required_column_values={"users.": "tenant_id"},
+            )
+
+    def test_required_column_values_rejects_blank_qualified_table_side(self):
+        with pytest.raises(
+            ValueError, match="'table.column' format when qualified"
+        ):
+            SQLEvaluatorConfig(
+                required_columns=["tenant_id"],
+                required_column_values={".tenant_id": "tenant_id"},
+            )
+
+    def test_required_column_values_rejects_empty_context_key(self):
+        with pytest.raises(ValueError, match="empty context key"):
+            SQLEvaluatorConfig(
+                required_columns=["tenant_id"],
+                required_column_values={"users.tenant_id": "   "},
+            )
+
+    def test_valid_required_column_values_accepted(self):
+        """Sanity check: a valid combination passes without raising."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")  # promote any warning to a failure
+            config = SQLEvaluatorConfig(
+                required_columns=["tenant_id"],
+                column_context="where",
+                required_column_values={"users.tenant_id": "tenant_id"},
+            )
+        assert config.required_column_values == {"users.tenant_id": "tenant_id"}
diff --git a/evaluators/builtin/tests/test_discovery.py b/evaluators/builtin/tests/test_discovery.py
new file mode 100644
index 00000000..62876412
--- /dev/null
+++ b/evaluators/builtin/tests/test_discovery.py
@@ -0,0 +1,187 @@
+"""Tests for entry-point-based evaluator discovery."""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+from agent_control_evaluators import (
+    Evaluator,
+    EvaluatorConfig,
+    EvaluatorMetadata,
+    clear_evaluators,
+    discover_evaluators,
+    ensure_evaluators_discovered,
+    get_all_evaluators,
+    list_evaluators,
+    register_evaluator,
+    reset_evaluator_discovery,
+)
+from agent_control_evaluators import _discovery as discovery_module
+from agent_control_models import EvaluatorResult
+
+
+class _DiscoveryConfig(EvaluatorConfig):
+    pass
+
+
+def _make_class(*, name: str, available: bool = True) -> type[Evaluator[_DiscoveryConfig]]:
+    class _Dummy(Evaluator[_DiscoveryConfig]):
+        metadata = EvaluatorMetadata(name=name, version="1.0.0", description="")
+        config_model = _DiscoveryConfig
+
+        @classmethod
+        def is_available(cls) -> bool:
+            return available
+
+        async def evaluate(self, data: Any) -> EvaluatorResult:
+            return EvaluatorResult(matched=False, confidence=1.0, message="")
+
+    _Dummy.__name__ = f"Discovery_{name.replace('-', '_')}"
+    return _Dummy
+
+
+@pytest.fixture
+def isolated_discovery():
+    """Snapshot the registry and reset discovery; restore the registry on teardown."""
+    snapshot = dict(get_all_evaluators())
+    clear_evaluators()
+    reset_evaluator_discovery()
+    yield
+    clear_evaluators()
+    reset_evaluator_discovery()
+    for cls in snapshot.values():
+        register_evaluator(cls)
+
+
+def _make_fake_entry_point(name: str, evaluator_class: type[Any]) -> MagicMock:
+    """Build a MagicMock that mimics importlib.metadata.EntryPoint."""
+    ep = MagicMock()
+    ep.name = name
+    ep.load.return_value = evaluator_class
+    return ep
+
+
+def test_discover_evaluators_registers_available_classes(isolated_discovery):
+    """Discover walks the entry-point group and registers each available class."""
+    cls = _make_class(name="disc-a")
+    fake_ep = _make_fake_entry_point("disc-a", cls)
+
+    with patch.object(discovery_module, "entry_points", return_value=[fake_ep]):
+        count = discover_evaluators()
+
+    assert count == 1
+    assert get_all_evaluators().get("disc-a") is cls
+
+
+def test_discover_evaluators_skips_unavailable_classes(isolated_discovery):
+    """Evaluators whose is_available() is False must NOT be registered."""
+    cls = _make_class(name="disc-unavailable", available=False)
+    fake_ep = _make_fake_entry_point("disc-unavailable", cls)
+
+    with patch.object(discovery_module, "entry_points", return_value=[fake_ep]):
+        count = discover_evaluators()
+
+    assert count == 0
+    assert "disc-unavailable" not in get_all_evaluators()
+
+
+def test_discover_evaluators_skips_already_registered(isolated_discovery):
+    """Already-registered names are skipped without raising."""
+    cls = _make_class(name="disc-existing")
+    register_evaluator(cls)
+
+    fake_ep = _make_fake_entry_point("disc-existing", cls)
+    with patch.object(discovery_module, "entry_points", return_value=[fake_ep]):
+        count = discover_evaluators()
+
+    assert count == 0
+
+
+def test_discover_evaluators_only_runs_once(isolated_discovery):
+    """Repeat calls short-circuit on the _DISCOVERY_COMPLETE flag."""
+    cls = _make_class(name="disc-once")
+    fake_ep = _make_fake_entry_point("disc-once", cls)
+
+    with patch.object(
+        discovery_module, "entry_points", return_value=[fake_ep]
+    ) as patched:
+        first = discover_evaluators()
+        second = discover_evaluators()
+
+    # First call discovers, second returns 0 without consulting entry_points.
+    assert first == 1
+    assert second == 0
+    assert patched.call_count == 1
+
+
+def test_discover_evaluators_swallows_load_failures(isolated_discovery):
+    """A broken entry point is logged and skipped, not propagated."""
+    bad_ep = MagicMock()
+    bad_ep.name = "broken"
+    bad_ep.load.side_effect = RuntimeError("boom")
+
+    good_cls = _make_class(name="disc-good")
+    good_ep = _make_fake_entry_point("disc-good", good_cls)
+
+    with patch.object(discovery_module, "entry_points", return_value=[bad_ep, good_ep]):
+        count = discover_evaluators()
+
+    assert count == 1
+    assert get_all_evaluators().get("disc-good") is good_cls
+
+
+def test_discover_evaluators_handles_entry_points_failure(isolated_discovery):
+    """If entry_points() itself raises, discovery completes with zero results."""
+    with patch.object(
+        discovery_module,
+        "entry_points",
+        side_effect=RuntimeError("entry-point system unavailable"),
+    ):
+        count = discover_evaluators()
+
+    assert count == 0
+
+
+def test_reset_evaluator_discovery_allows_rerun(isolated_discovery):
+    """reset_evaluator_discovery clears the completed flag so discover runs again."""
+    cls = _make_class(name="disc-reset")
+    fake_ep = _make_fake_entry_point("disc-reset", cls)
+
+    with patch.object(
+        discovery_module, "entry_points", return_value=[fake_ep]
+    ) as patched:
+        discover_evaluators()
+        clear_evaluators()
+        reset_evaluator_discovery()
+        count = discover_evaluators()
+
+    assert count == 1
+    assert patched.call_count == 2
+
+
+def test_ensure_evaluators_discovered_runs_once(isolated_discovery):
+    """ensure_evaluators_discovered is the lazy-init entry point."""
+    cls = _make_class(name="disc-ensure")
+    fake_ep = _make_fake_entry_point("disc-ensure", cls)
+
+    with patch.object(
+        discovery_module, "entry_points", return_value=[fake_ep]
+    ) as patched:
+        ensure_evaluators_discovered()
+        ensure_evaluators_discovered()
+
+    assert patched.call_count == 1
+    assert get_all_evaluators().get("disc-ensure") is cls
+
+
+def test_list_evaluators_triggers_discovery(isolated_discovery):
+    """list_evaluators is the convenience accessor; it must trigger discovery."""
+    cls = _make_class(name="disc-list")
+    fake_ep = _make_fake_entry_point("disc-list", cls)
+
+    with patch.object(discovery_module, "entry_points", return_value=[fake_ep]):
+        result = list_evaluators()
+
+    assert result.get("disc-list") is cls
diff --git a/evaluators/builtin/tests/test_factory.py b/evaluators/builtin/tests/test_factory.py
new file mode 100644
index 00000000..4bba4b82
--- /dev/null
+++ b/evaluators/builtin/tests/test_factory.py
@@ -0,0 +1,172 @@
+"""Tests for the LRU-cached evaluator factory."""
+
+from __future__ import annotations
+
+import importlib
+from typing import Any
+
+import pytest
+from agent_control_evaluators import (
+    Evaluator,
+    EvaluatorConfig,
+    EvaluatorMetadata,
+    clear_evaluator_cache,
+    clear_evaluators,
+    get_all_evaluators,
+    get_evaluator_instance,
+    register_evaluator,
+)
+from agent_control_evaluators import _factory as factory_module
+from agent_control_models import EvaluatorResult, EvaluatorSpec
+
+
+class _FactoryConfig(EvaluatorConfig):
+    payload: str = "default"
+
+
+class _FactoryEvaluator(Evaluator[_FactoryConfig]):
+    metadata = EvaluatorMetadata(name="factory-dummy", version="1.0.0", description="")
+    config_model = _FactoryConfig
+
+    async def evaluate(self, data: Any) -> EvaluatorResult:
+        return EvaluatorResult(matched=False, confidence=1.0, message="")
+
+
+@pytest.fixture
+def isolated_factory():
+    """Snapshot registry/cache so factory tests don't leak state."""
+    snapshot = dict(get_all_evaluators())
+    clear_evaluators()
+    clear_evaluator_cache()
+    register_evaluator(_FactoryEvaluator)
+    yield
+    clear_evaluator_cache()
+    clear_evaluators()
+    for cls in snapshot.values():
+        register_evaluator(cls)
+
+
+def test_get_evaluator_instance_returns_evaluator(isolated_factory):
+    spec = EvaluatorSpec(name="factory-dummy", config={"payload": "p1"})
+
+    instance = get_evaluator_instance(spec)
+
+    assert isinstance(instance, _FactoryEvaluator)
+    assert instance.config.payload == "p1"
+
+
+def test_get_evaluator_instance_caches_by_config(isolated_factory):
+    spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "same"})
+    spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "same"})
+
+    first = get_evaluator_instance(spec_a)
+    second = get_evaluator_instance(spec_b)
+
+    # Same config = same cached instance.
+    assert first is second
+
+
+def test_get_evaluator_instance_treats_different_configs_separately(isolated_factory):
+    spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "a"})
+    spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "b"})
+
+    instance_a = get_evaluator_instance(spec_a)
+    instance_b = get_evaluator_instance(spec_b)
+
+    assert instance_a is not instance_b
+    assert instance_a.config.payload == "a"
+    assert instance_b.config.payload == "b"
+
+
+def test_get_evaluator_instance_raises_for_unknown_evaluator(isolated_factory):
+    with pytest.raises(ValueError, match="not found"):
+        get_evaluator_instance(EvaluatorSpec(name="no-such-evaluator", config={}))
+
+
+def test_clear_evaluator_cache_forces_recreation(isolated_factory):
+    spec = EvaluatorSpec(name="factory-dummy", config={"payload": "p"})
+
+    first = get_evaluator_instance(spec)
+    clear_evaluator_cache()
+    second = get_evaluator_instance(spec)
+
+    assert first is not second
+
+
+def test_get_evaluator_instance_evicts_oldest_when_full(isolated_factory, monkeypatch):
+    """LRU eviction: when cache is full, the least-recently-used entry is dropped."""
+    # Force a tiny cache so we can observe eviction without overhead.
+    monkeypatch.setattr(factory_module, "EVALUATOR_CACHE_SIZE", 2)
+
+    spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "a"})
+    spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "b"})
+    spec_c = EvaluatorSpec(name="factory-dummy", config={"payload": "c"})
+
+    first_a = get_evaluator_instance(spec_a)
+    get_evaluator_instance(spec_b)
+    # Insert third → "a" is the LRU and must be evicted.
+    get_evaluator_instance(spec_c)
+
+    re_a = get_evaluator_instance(spec_a)
+    # "a" was evicted: new instance must NOT be the original.
+    assert re_a is not first_a
+
+
+def test_get_evaluator_instance_moves_hit_to_most_recent(
+    isolated_factory, monkeypatch
+):
+    """Cache hit must refresh LRU recency so the touched entry isn't evicted next."""
+    monkeypatch.setattr(factory_module, "EVALUATOR_CACHE_SIZE", 2)
+
+    spec_a = EvaluatorSpec(name="factory-dummy", config={"payload": "a"})
+    spec_b = EvaluatorSpec(name="factory-dummy", config={"payload": "b"})
+    spec_c = EvaluatorSpec(name="factory-dummy", config={"payload": "c"})
+
+    first_a = get_evaluator_instance(spec_a)
+    get_evaluator_instance(spec_b)
+    # Touch "a" so "b" becomes the LRU.
+    re_a = get_evaluator_instance(spec_a)
+    assert re_a is first_a
+
+    # Inserting "c" should evict "b", not "a".
+    get_evaluator_instance(spec_c)
+
+    refetched_a = get_evaluator_instance(spec_a)
+    assert refetched_a is first_a  # still cached
+
+
+def test_parse_cache_size_uses_default_when_unset(monkeypatch):
+    """With the env var unset, the module falls back to DEFAULT_CACHE_SIZE."""
+    monkeypatch.delenv("EVALUATOR_CACHE_SIZE", raising=False)
+    cache_size = importlib.reload(factory_module).EVALUATOR_CACHE_SIZE
+    monkeypatch.undo()  # restore the real env before the cleanup reload
+    importlib.reload(factory_module)
+    assert cache_size == factory_module.DEFAULT_CACHE_SIZE
+
+
+def test_parse_cache_size_falls_back_on_invalid_value(monkeypatch):
+    """A non-integer env value is ignored in favour of DEFAULT_CACHE_SIZE."""
+    monkeypatch.setenv("EVALUATOR_CACHE_SIZE", "not-a-number")
+    cache_size = importlib.reload(factory_module).EVALUATOR_CACHE_SIZE
+    monkeypatch.undo()  # restore the real env before the cleanup reload
+    importlib.reload(factory_module)
+    assert cache_size == factory_module.DEFAULT_CACHE_SIZE
+
+
+def test_parse_cache_size_clamps_to_minimum(monkeypatch):
+    """A configured size below MIN_CACHE_SIZE is raised to at least the minimum."""
+    monkeypatch.setenv("EVALUATOR_CACHE_SIZE", "0")
+    cache_size = importlib.reload(factory_module).EVALUATOR_CACHE_SIZE
+    monkeypatch.undo()  # restore the real env before the cleanup reload
+    importlib.reload(factory_module)
+    # Anything below MIN_CACHE_SIZE is clamped to avoid infinite eviction loops.
+    assert cache_size >= factory_module.MIN_CACHE_SIZE
+
+
+def test_parse_cache_size_accepts_valid_int(monkeypatch):
+    """A valid integer env value becomes the module's EVALUATOR_CACHE_SIZE."""
+    monkeypatch.setenv("EVALUATOR_CACHE_SIZE", "42")
+    cache_size = importlib.reload(factory_module).EVALUATOR_CACHE_SIZE
+    monkeypatch.undo()  # restore the real env before the cleanup reload
+    importlib.reload(factory_module)
+    assert cache_size == 42
diff --git a/evaluators/builtin/tests/test_registry.py b/evaluators/builtin/tests/test_registry.py
new file mode 100644
index 00000000..6b663129
--- /dev/null
+++ b/evaluators/builtin/tests/test_registry.py
@@ -0,0 +1,119 @@
+"""Tests for the in-memory evaluator registry."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+from agent_control_evaluators import (
+    Evaluator,
+    EvaluatorConfig,
+    EvaluatorMetadata,
+    clear_evaluators,
+    get_all_evaluators,
+    get_evaluator,
+    register_evaluator,
+)
+from agent_control_models import EvaluatorResult
+
+
+class _DummyConfig(EvaluatorConfig):
+    pass
+
+
+def _make_class(*, name: str, available: bool = True) -> type[Evaluator[_DummyConfig]]:
+    """Build a fresh Evaluator subclass with the supplied metadata name."""
+
+    class _Dummy(Evaluator[_DummyConfig]):
+        metadata = EvaluatorMetadata(
+            name=name,
+            version="1.0.0",
+            description="",
+        )
+        config_model = _DummyConfig
+
+        @classmethod
+        def is_available(cls) -> bool:
+            return available
+
+        async def evaluate(self, data: Any) -> EvaluatorResult:
+            return EvaluatorResult(matched=False, confidence=1.0, message="")
+
+    _Dummy.__name__ = f"Dummy_{name.replace('-', '_')}"
+    return _Dummy
+
+
+@pytest.fixture
+def isolated_registry():
+    """Snapshot and restore the global registry so tests don't leak state."""
+    snapshot = dict(get_all_evaluators())
+    clear_evaluators()
+    yield
+    clear_evaluators()
+    for cls in snapshot.values():
+        register_evaluator(cls)
+
+
+def test_register_and_lookup_evaluator(isolated_registry):
+    cls = _make_class(name="reg-a")
+
+    register_evaluator(cls)
+
+    assert get_evaluator("reg-a") is cls
+
+
+def test_get_evaluator_returns_none_when_not_registered(isolated_registry):
+    assert get_evaluator("does-not-exist") is None
+
+
+def test_get_all_evaluators_returns_copy(isolated_registry):
+    cls = _make_class(name="reg-copy")
+    register_evaluator(cls)
+
+    snapshot = get_all_evaluators()
+    snapshot["evil"] = cls  # mutate the returned dict
+
+    # Internal registry must not reflect external mutation.
+    assert "evil" not in get_all_evaluators()
+
+
+def test_register_is_idempotent_for_same_class(isolated_registry):
+    cls = _make_class(name="reg-idem")
+
+    register_evaluator(cls)
+    # Registering the exact same class again must not raise.
+    assert register_evaluator(cls) is cls
+
+
+def test_register_rejects_name_collision_with_different_class(isolated_registry):
+    first = _make_class(name="reg-conflict")
+    second = _make_class(name="reg-conflict")
+    register_evaluator(first)
+
+    with pytest.raises(ValueError, match="already registered"):
+        register_evaluator(second)
+
+
+def test_register_skips_unavailable_evaluators(isolated_registry):
+    cls = _make_class(name="reg-unavailable", available=False)
+
+    # Should not raise and should not register.
+    assert register_evaluator(cls) is cls
+    assert get_evaluator("reg-unavailable") is None
+
+
+def test_clear_evaluators_empties_registry(isolated_registry):
+    register_evaluator(_make_class(name="reg-c1"))
+    register_evaluator(_make_class(name="reg-c2"))
+    assert len(get_all_evaluators()) == 2
+
+    clear_evaluators()
+
+    assert get_all_evaluators() == {}
+
+
+def test_register_decorator_returns_class(isolated_registry):
+    cls = _make_class(name="reg-decorator")
+    # The function is documented as decorator-compatible: it must return the class.
+    decorated = register_evaluator(cls)
+    assert decorated is cls

From c8662f72fe1389d8cede20e694317a3c0a784fb7 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Sat, 16 May 2026 06:34:08 -0700
Subject: [PATCH 15/20] coverage

---
 .../galileo/tests/test_luna_coverage_gaps.py  | 567 ++++++++++++++++++
 .../tests/test_evaluators_optional_imports.py |  93 +++
 2 files changed, 660 insertions(+)
 create mode 100644 evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py
 create mode 100644 sdks/python/tests/test_evaluators_optional_imports.py

diff --git a/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py
new file mode 100644
index 00000000..f4d0e360
--- /dev/null
+++ b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py
@@ -0,0 +1,567 @@
+"""Targeted tests filling coverage gaps in luna/evaluator.py and luna/client.py.
+
+These tests cover the small utility functions and rare branches that the
+integration-style tests in ``test_luna_evaluator.py`` skip past.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import httpx
+import pytest
+
+
+# =============================================================================
+# luna/evaluator.py: utility helpers
+# =============================================================================
+
+
+class TestCoercePayloadText:
+    """``_coerce_payload_text`` normalises arbitrary values to strings."""
+
+    def test_none_returns_none(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text
+
+        assert _coerce_payload_text(None) is None
+
+    def test_string_passed_through(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text
+
+        assert _coerce_payload_text("hello") == "hello"
+
+    @pytest.mark.parametrize("value", [42, 3.14, True])
+    def test_scalars_stringified(self, value):
+        from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text
+
+        assert _coerce_payload_text(value) == str(value)
+
+    def test_dict_is_json_serialized(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text
+
+        result = _coerce_payload_text({"a": 1, "b": 2})
+
+        assert json.loads(result) == {"a": 1, "b": 2}
+
+    def test_unserialisable_falls_back_to_str(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _coerce_payload_text
+
+        class CannotJson:
+            def __repr__(self):
+                return "<CannotJson>"
+
+        # NOTE(review): despite the test name, this input presumably never hits
+        # the str() fallback; a value json.dumps cannot encode would be needed.
+        cannot = CannotJson()
+        result = _coerce_payload_text({"obj": cannot})
+
+        # Only the type is pinned: default=str (if used) stringifies the object.
+        assert isinstance(result, str)
+
+
+class TestExtractDictText:
+    """``_extract_dict_text`` returns ``None`` for missing keys."""
+
+    def test_missing_key_returns_none(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _extract_dict_text
+
+        assert _extract_dict_text({}, "absent") is None
+
+    def test_present_key_coerced(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _extract_dict_text
+
+        assert _extract_dict_text({"x": 7}, "x") == "7"
+
+
+class TestContains:
+    """``_contains`` supports str/list/dict scores against a threshold."""
+
+    def test_none_threshold_is_no_match(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _contains
+
+        assert _contains("anything", None) is False
+
+    def test_string_contains_substring(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _contains
+
+        assert _contains("hello world", "world") is True
+        assert _contains("hello world", "absent") is False
+
+    def test_list_contains_value(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _contains
+
+        assert _contains(["a", "b", "c"], "b") is True
+        assert _contains(["a", "b", "c"], "z") is False
+
+    def test_dict_threshold_matches_key(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _contains
+
+        assert _contains({"toxicity": 0.9}, "toxicity") is True
+
+    def test_dict_threshold_matches_value(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _contains
+
+        assert _contains({"label": "flagged"}, "flagged") is True
+
+    def test_other_types_return_false(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _contains
+
+        # Non-iterable score => no match.
+        assert _contains(42, 42) is False
+
+
+class TestConfidenceFromScore:
+    """``_confidence_from_score`` maps a raw score to [0, 1]."""
+
+    def test_true_bool_maps_to_one(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score
+
+        assert _confidence_from_score(True) == 1.0
+
+    def test_false_bool_maps_to_zero(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score
+
+        assert _confidence_from_score(False) == 0.0
+
+    def test_in_range_number_returned_as_is(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score
+
+        assert _confidence_from_score(0.42) == 0.42
+
+    def test_out_of_range_falls_back_to_one(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score
+
+        # Above 1.0 → fall back to default confidence
+        assert _confidence_from_score(7.2) == 1.0
+
+    def test_non_numeric_falls_back_to_one(self):
+        from agent_control_evaluator_galileo.luna.evaluator import _confidence_from_score
+
+        assert _confidence_from_score("not-a-number") == 1.0
+
+
+# =============================================================================
+# luna/evaluator.py: _score_matches operator branches
+# =============================================================================
+
+
+@pytest.fixture
+def luna_evaluator(monkeypatch):
+    """A ready-to-use LunaEvaluator instance with auth env wired up."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+    return LunaEvaluator.from_dict(
+        {"scorer_label": "toxicity", "threshold": 0.5, "operator": "gte"}
+    )
+
+
+class TestScoreMatchesOperators:
+    """Every operator branch in ``_score_matches`` should evaluate."""
+
+    def _make(self, operator, threshold, monkeypatch):
+        """Build a LunaEvaluator for ``operator`` compared against ``threshold``.
+
+        The threshold is forwarded unchanged for every operator.
+        """
+        monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        return LunaEvaluator.from_dict(
+            {"scorer_label": "toxicity", "threshold": threshold, "operator": operator}
+        )
+
+    def test_any_truthy_score_matches(self, monkeypatch):
+        evaluator = self._make("any", 0.5, monkeypatch)
+        assert evaluator._score_matches(1) is True
+        assert evaluator._score_matches(0) is False
+
+    def test_eq_matches_threshold(self, monkeypatch):
+        evaluator = self._make("eq", "flagged", monkeypatch)
+        assert evaluator._score_matches("flagged") is True
+        assert evaluator._score_matches("safe") is False
+
+    def test_ne_matches_when_different(self, monkeypatch):
+        evaluator = self._make("ne", "flagged", monkeypatch)
+        assert evaluator._score_matches("safe") is True
+        assert evaluator._score_matches("flagged") is False
+
+    def test_contains_matches_substring(self, monkeypatch):
+        evaluator = self._make("contains", "flag", monkeypatch)
+        assert evaluator._score_matches("flagged") is True
+        assert evaluator._score_matches("clean") is False
+
+    def test_numeric_operators_all_branches(self, monkeypatch):
+        for op, expectations in [
+            ("gt", [(0.9, True), (0.5, False)]),
+            ("gte", [(0.5, True), (0.4, False)]),
+            ("lt", [(0.4, True), (0.5, False)]),
+            ("lte", [(0.5, True), (0.6, False)]),
+        ]:
+            evaluator = self._make(op, 0.5, monkeypatch)
+            for score, expected in expectations:
+                assert evaluator._score_matches(score) is expected, (op, score)
+
+    def test_numeric_operator_rejects_non_numeric_score(self, monkeypatch):
+        evaluator = self._make("gte", 0.5, monkeypatch)
+        with pytest.raises(ValueError, match="not numeric"):
+            evaluator._score_matches("not-a-number")
+
+
+# =============================================================================
+# luna/evaluator.py: payload preparation + aclose
+# =============================================================================
+
+
+class TestPreparePayload:
+    """``_prepare_payload`` routes scalar data based on the scorer label."""
+
+    def test_scalar_routed_to_input_when_label_lacks_output(self, monkeypatch):
+        monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
+
+        input_text, output_text = evaluator._prepare_payload("hello")
+
+        assert input_text == "hello"
+        assert output_text is None
+
+    def test_scalar_routed_to_output_when_label_contains_output(self, monkeypatch):
+        monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        evaluator = LunaEvaluator.from_dict(
+            {"scorer_label": "output_correctness", "threshold": 0.5}
+        )
+
+        input_text, output_text = evaluator._prepare_payload("hello")
+
+        assert input_text is None
+        assert output_text == "hello"
+
+
+@pytest.mark.asyncio
+async def test_evaluator_aclose_closes_underlying_client(monkeypatch):
+    """``aclose`` must release the HTTP client when one was created."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+    evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
+
+    fake = MagicMock()
+    fake.close = AsyncMock()
+    evaluator._client = fake
+
+    await evaluator.aclose()
+
+    fake.close.assert_awaited_once()
+    assert evaluator._client is None
+
+
+@pytest.mark.asyncio
+async def test_evaluator_handles_non_success_status(monkeypatch):
+    """A non-success status from the scorer must surface as an error result."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna import LunaEvaluator, ScorerInvokeResponse
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    evaluator = LunaEvaluator.from_dict(
+        {"scorer_label": "toxicity", "threshold": 0.5, "operator": "gte"}
+    )
+
+    with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
+        mock_invoke.return_value = ScorerInvokeResponse(
+            scorer_label="toxicity",
+            score=None,
+            status="failed",
+            error_message="upstream timeout",
+        )
+
+        result = await evaluator.evaluate("hello")
+
+    assert result.matched is False
+    assert result.error is not None
+    assert "upstream timeout" in result.error
+
+
+# =============================================================================
+# luna/evaluator.py: package version fallback
+# =============================================================================
+
+
+def test_resolve_package_version_falls_back_when_metadata_missing():
+    """The dev fallback must trigger when the package isn't installed by metadata."""
+    from importlib.metadata import PackageNotFoundError
+
+    from agent_control_evaluator_galileo.luna import evaluator as evaluator_module
+
+    with patch.object(evaluator_module, "version", side_effect=PackageNotFoundError):
+        result = evaluator_module._resolve_package_version()
+
+    assert result == "0.0.0.dev"
+
+
+# =============================================================================
+# luna/client.py: small helpers + branches
+# =============================================================================
+
+
+class TestAsFloatOrNone:
+    """``_as_float_or_none`` parses scalar values; strings may fail."""
+
+    def test_returns_none_for_bool(self):
+        from agent_control_evaluator_galileo.luna.client import _as_float_or_none
+
+        assert _as_float_or_none(True) is None
+
+    def test_returns_none_for_none(self):
+        from agent_control_evaluator_galileo.luna.client import _as_float_or_none
+
+        assert _as_float_or_none(None) is None
+
+    def test_returns_float_for_int(self):
+        from agent_control_evaluator_galileo.luna.client import _as_float_or_none
+
+        assert _as_float_or_none(7) == 7.0
+
+    def test_returns_float_for_string_number(self):
+        from agent_control_evaluator_galileo.luna.client import _as_float_or_none
+
+        assert _as_float_or_none("0.42") == 0.42
+
+    def test_returns_none_for_unparseable_string(self):
+        from agent_control_evaluator_galileo.luna.client import _as_float_or_none
+
+        assert _as_float_or_none("not-a-number") is None
+
+    def test_returns_none_for_other_types(self):
+        from agent_control_evaluator_galileo.luna.client import _as_float_or_none
+
+        assert _as_float_or_none([1, 2]) is None
+
+
+class TestHasValue:
+    """``_has_value`` is the "is this scorable" predicate."""
+
+    def test_none_is_empty(self):
+        from agent_control_evaluator_galileo.luna.client import _has_value
+
+        assert _has_value(None) is False
+
+    def test_empty_string_is_empty(self):
+        from agent_control_evaluator_galileo.luna.client import _has_value
+
+        assert _has_value("") is False
+        assert _has_value("   ") is False
+
+    def test_non_empty_string_has_value(self):
+        from agent_control_evaluator_galileo.luna.client import _has_value
+
+        assert _has_value("hi") is True
+
+    def test_empty_list_or_dict_is_empty(self):
+        from agent_control_evaluator_galileo.luna.client import _has_value
+
+        assert _has_value([]) is False
+        assert _has_value({}) is False
+
+    def test_non_empty_list_or_dict_has_value(self):
+        from agent_control_evaluator_galileo.luna.client import _has_value
+
+        assert _has_value([1]) is True
+        assert _has_value({"k": "v"}) is True
+
+    def test_scalar_other_types_have_value(self):
+        from agent_control_evaluator_galileo.luna.client import _has_value
+
+        assert _has_value(42) is True
+        assert _has_value(0) is True  # 0 is a real value, not empty
+        assert _has_value(True) is True
+
+
+class TestScorerInvokeRequestValidation:
+    """``ScorerInvokeRequest`` rejects malformed input combos."""
+
+    def test_missing_all_identifiers_raises(self):
+        from agent_control_evaluator_galileo.luna.client import (
+            ScorerInvokeInputs,
+            ScorerInvokeRequest,
+        )
+        from pydantic import ValidationError
+
+        with pytest.raises(ValidationError, match="One of scorer_label"):
+            ScorerInvokeRequest(inputs=ScorerInvokeInputs(query="hello"))
+
+
+def test_client_raises_when_no_credentials(monkeypatch):
+    """The client requires at least an API secret or an API key."""
+    for name in (
+        "GALILEO_API_SECRET_KEY",
+        "GALILEO_API_SECRET",
+        "GALILEO_API_KEY",
+    ):
+        monkeypatch.delenv(name, raising=False)
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY"):
+        GalileoLunaClient()
+
+
+class TestDeriveApiUrl:
+    """URL derivation covers every console.* → api.* substitution branch."""
+
+    def _client(self, monkeypatch):
+        monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+        from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+        return GalileoLunaClient()
+
+    def test_console_dot_rewritten_to_api_dot(self, monkeypatch):
+        client = self._client(monkeypatch)
+        assert (
+            client._derive_api_url("https://console.galileo.ai")
+            == "https://api.galileo.ai"
+        )
+
+    def test_console_dash_rewritten_to_api_dash(self, monkeypatch):
+        client = self._client(monkeypatch)
+        assert (
+            client._derive_api_url("https://console-staging.galileo.ai")
+            == "https://api-staging.galileo.ai"
+        )
+
+    def test_plain_https_host_gets_api_prefix(self, monkeypatch):
+        client = self._client(monkeypatch)
+        assert (
+            client._derive_api_url("https://example.com")
+            == "https://api.example.com"
+        )
+
+    def test_plain_http_host_gets_api_prefix(self, monkeypatch):
+        client = self._client(monkeypatch)
+        assert client._derive_api_url("http://example.com") == "http://api.example.com"
+
+    def test_unknown_scheme_returned_as_is(self, monkeypatch):
+        client = self._client(monkeypatch)
+        # No console./console- prefix, no http(s) scheme → return unchanged.
+        assert client._derive_api_url("api.example.com") == "api.example.com"
+
+
+@pytest.mark.asyncio
+async def test_get_client_adds_api_key_header_when_no_secret(monkeypatch):
+    """When only an API key is configured, the public-API header is set."""
+    monkeypatch.delenv("GALILEO_API_SECRET_KEY", raising=False)
+    monkeypatch.delenv("GALILEO_API_SECRET", raising=False)
+    monkeypatch.setenv("GALILEO_API_KEY", "public-key")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+    http_client = await client._get_client()
+    try:
+        assert http_client.headers.get("Galileo-API-Key") == "public-key"
+    finally:
+        await client.close()
+
+
+@pytest.mark.asyncio
+async def test_invoke_rejects_missing_scorer_identifier(monkeypatch):
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+    try:
+        with pytest.raises(ValueError, match="At least one scorer identifier"):
+            await client.invoke(input="hello")
+    finally:
+        await client.close()
+
+
+@pytest.mark.asyncio
+async def test_invoke_raises_when_response_is_not_a_json_object(monkeypatch):
+    """A non-object JSON body must surface as a clear RuntimeError."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+
+    fake_response = MagicMock()
+    fake_response.raise_for_status = MagicMock()
+    fake_response.json = MagicMock(return_value=["not", "an", "object"])
+
+    fake_http = AsyncMock()
+    fake_http.post = AsyncMock(return_value=fake_response)
+    fake_http.is_closed = False
+    client._client = fake_http
+
+    try:
+        with pytest.raises(RuntimeError, match="not a JSON object"):
+            await client.invoke(scorer_label="toxicity", input="hello")
+    finally:
+        await client.close()
+
+
+@pytest.mark.asyncio
+async def test_invoke_propagates_http_status_error(monkeypatch):
+    """The client logs and re-raises HTTP status errors."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+
+    fake_response = MagicMock(spec=httpx.Response)
+    fake_response.status_code = 500
+    fake_response.text = "internal error"
+    fake_response.raise_for_status = MagicMock(
+        side_effect=httpx.HTTPStatusError(
+            "boom", request=MagicMock(spec=httpx.Request), response=fake_response
+        )
+    )
+
+    fake_http = AsyncMock()
+    fake_http.post = AsyncMock(return_value=fake_response)
+    fake_http.is_closed = False
+    client._client = fake_http
+
+    try:
+        with pytest.raises(httpx.HTTPStatusError):
+            await client.invoke(scorer_label="toxicity", input="hello")
+    finally:
+        await client.close()
+
+
+@pytest.mark.asyncio
+async def test_invoke_propagates_request_error(monkeypatch):
+    """RequestError is logged and re-raised so callers can decide policy."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+
+    fake_http = AsyncMock()
+    fake_http.post = AsyncMock(side_effect=httpx.RequestError("network down"))
+    fake_http.is_closed = False
+    client._client = fake_http
+
+    try:
+        with pytest.raises(httpx.RequestError):
+            await client.invoke(scorer_label="toxicity", input="hello")
+    finally:
+        await client.close()
+
+
+@pytest.mark.asyncio
+async def test_client_async_context_manager_closes_on_exit(monkeypatch):
+    """Entering/exiting the async context manager must close the client."""
+    monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    async with GalileoLunaClient() as client:
+        # Trigger lazy client creation so close() has work to do.
+        await client._get_client()
+        assert client._client is not None
+
+    # __aexit__ closes the underlying httpx client.
+    assert client._client is None
diff --git a/sdks/python/tests/test_evaluators_optional_imports.py b/sdks/python/tests/test_evaluators_optional_imports.py
new file mode 100644
index 00000000..a2842a1b
--- /dev/null
+++ b/sdks/python/tests/test_evaluators_optional_imports.py
@@ -0,0 +1,93 @@
+"""Coverage for the optional galileo import fallbacks in agent_control.evaluators.
+
+The galileo extras are normally installed in the dev environment, so the
+``except ImportError`` branches in ``agent_control/evaluators/__init__.py``
+never fire under regular tests. This module forces those failures by hiding
+the relevant modules in ``sys.modules`` and reloading the package.
+"""
+
+from __future__ import annotations
+
+import builtins
+import importlib
+import sys
+
+
+def _reload_evaluators_with_blocked(prefix: str) -> object:
+    """Reload ``agent_control.evaluators`` while ``prefix.*`` imports fail.
+
+    Returns the freshly loaded module so callers can inspect ``__all__``.
+    Restores the original ``builtins.__import__`` and ``sys.modules`` entries
+    on the way out.
+    """
+    original_import = builtins.__import__
+
+    def fail_for_prefix(name: str, *args: object, **kwargs: object) -> object:
+        if name == prefix or name.startswith(f"{prefix}."):
+            raise ImportError(f"forced failure for {name}")
+        return original_import(name, *args, **kwargs)  # type: ignore[arg-type]
+
+    # Drop any cached entries so the patched import is consulted.
+    blocked_modules = [m for m in list(sys.modules) if m == prefix or m.startswith(f"{prefix}.")]
+    saved_modules = {m: sys.modules.pop(m) for m in blocked_modules}
+    saved_evaluators = sys.modules.pop("agent_control.evaluators", None)
+
+    builtins.__import__ = fail_for_prefix
+    try:
+        import agent_control.evaluators as reloaded
+
+        reloaded = importlib.reload(reloaded)
+        return reloaded
+    finally:
+        builtins.__import__ = original_import
+        # Restore cached modules; never leave the degraded reload registered.
+        sys.modules.update(saved_modules)
+        sys.modules.pop("agent_control.evaluators", None)
+        if saved_evaluators is not None:
+            sys.modules["agent_control.evaluators"] = saved_evaluators
+
+
+def test_module_loads_when_galileo_luna_is_unavailable():
+    """Hiding ``agent_control_evaluator_galileo.luna`` exercises its except branch."""
+    reloaded = _reload_evaluators_with_blocked("agent_control_evaluator_galileo.luna")
+
+    # Core names are always present.
+    assert "Evaluator" in reloaded.__all__
+    # Luna1 names are NOT present because the import failed.
+    assert "LunaEvaluator" not in reloaded.__all__
+    assert "GalileoLunaClient" not in reloaded.__all__
+
+
+def test_module_loads_when_galileo_package_is_unavailable():
+    """Hiding the whole package exercises both ImportError fallbacks at once."""
+    reloaded = _reload_evaluators_with_blocked("agent_control_evaluator_galileo")
+
+    assert "Evaluator" in reloaded.__all__
+    # Both luna1 and luna2 optional names are absent.
+    for absent in (
+        "LunaEvaluator",
+        "GalileoLunaClient",
+        "Luna2Evaluator",
+        "Luna2EvaluatorConfig",
+        "LUNA_AVAILABLE",
+        "LUNA2_AVAILABLE",
+    ):
+        assert absent not in reloaded.__all__
+
+
+def test_module_loads_galileo_optional_imports_when_available():
+    """Sanity check: with galileo installed, the optional names ARE exposed.
+
+    Reloading without patching __import__ runs both success branches.
+    """
+    saved = sys.modules.pop("agent_control.evaluators", None)
+    try:
+        import agent_control.evaluators as reloaded
+
+        reloaded = importlib.reload(reloaded)
+        # Sanity: at least one luna1 and one luna2 name should reappear.
+        assert "LunaEvaluator" in reloaded.__all__
+        assert "Luna2Evaluator" in reloaded.__all__
+    finally:
+        if saved is not None:
+            sys.modules["agent_control.evaluators"] = saved

From 9a94bba0eb4500f3e816e86008446289c26fb1d1 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Sat, 16 May 2026 06:41:50 -0700
Subject: [PATCH 16/20] move coverage

---
 .../python/tests/test_evaluators_optional_imports.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sdks/python/tests/test_evaluators_optional_imports.py b/sdks/python/tests/test_evaluators_optional_imports.py
index a2842a1b..2bc2129e 100644
--- a/sdks/python/tests/test_evaluators_optional_imports.py
+++ b/sdks/python/tests/test_evaluators_optional_imports.py
@@ -10,8 +10,16 @@
 
 import builtins
 import importlib
+import importlib.util
 import sys
 
+import pytest
+
+_GALILEO_INSTALLED = (
+    importlib.util.find_spec("agent_control_evaluator_galileo.luna") is not None
+    and importlib.util.find_spec("agent_control_evaluator_galileo.luna2") is not None
+)
+
 
 def _reload_evaluators_with_blocked(prefix: str) -> object:
     """Reload ``agent_control.evaluators`` while ``prefix.*`` imports fail.
@@ -75,6 +83,10 @@ def test_module_loads_when_galileo_package_is_unavailable():
         assert absent not in reloaded.__all__
 
 
+@pytest.mark.skipif(
+    not _GALILEO_INSTALLED,
+    reason="agent-control-evaluator-galileo extras not installed in this environment",
+)
 def test_module_loads_galileo_optional_imports_when_available():
     """Sanity check: with galileo installed, the optional names ARE exposed.
 

From f0d11b7fede964cab123728788ebb7ea4fc07fa7 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Sat, 16 May 2026 06:45:41 -0700
Subject: [PATCH 17/20] failing test

---
 .../tests/test_evaluators_optional_imports.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/sdks/python/tests/test_evaluators_optional_imports.py b/sdks/python/tests/test_evaluators_optional_imports.py
index 2bc2129e..735164be 100644
--- a/sdks/python/tests/test_evaluators_optional_imports.py
+++ b/sdks/python/tests/test_evaluators_optional_imports.py
@@ -15,10 +15,21 @@
 
 import pytest
 
-_GALILEO_INSTALLED = (
-    importlib.util.find_spec("agent_control_evaluator_galileo.luna") is not None
-    and importlib.util.find_spec("agent_control_evaluator_galileo.luna2") is not None
-)
+
+def _module_available(name: str) -> bool:
+    """Return whether ``name`` resolves without raising for missing parents."""
+    try:
+        return importlib.util.find_spec(name) is not None
+    except (ImportError, ValueError):
+        # ``find_spec`` raises ModuleNotFoundError (a subclass of ImportError)
+        # when a *parent* package is missing, instead of returning None. Treat
+        # that as "not installed."
+        return False
+
+
+_GALILEO_INSTALLED = _module_available(
+    "agent_control_evaluator_galileo.luna"
+) and _module_available("agent_control_evaluator_galileo.luna2")
 
 
 def _reload_evaluators_with_blocked(prefix: str) -> object:

From f683dda765e3623d2fbac0a6da81d9aaf3157296 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Mon, 18 May 2026 10:26:50 -0700
Subject: [PATCH 18/20] add input text that goes into controls evaluators

---
 engine/src/agent_control_engine/core.py    |  1 +
 engine/tests/test_core.py                  | 42 ++++++++++++++++++++++
 server/src/agent_control_server/migrate.py |  2 ++
 server/tests/test_migrate.py               |  5 +++
 4 files changed, 50 insertions(+)

diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py
index 99c2273b..fdb323e0 100644
--- a/engine/src/agent_control_engine/core.py
+++ b/engine/src/agent_control_engine/core.py
@@ -224,6 +224,7 @@ async def _evaluate_leaf(
             "message": self._truncated_message(result.message),
         }
         metadata = dict(result.metadata or {})
+        metadata["selected_data"] = data
         metadata["condition_trace"] = trace
         return _ConditionEvaluation(
             result=result.model_copy(update={"metadata": metadata}),
diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py
index 78eda0ab..d2eb0871 100644
--- a/engine/tests/test_core.py
+++ b/engine/tests/test_core.py
@@ -1412,6 +1412,48 @@ async def test_or_short_circuit_records_skipped_trace(self):
         assert trace["children"][1]["matched"] is None
         assert trace["children"][1]["short_circuit_reason"] == "or_matched"
 
+    @pytest.mark.asyncio
+    async def test_leaf_metadata_includes_selector_selected_data(self):
+        """Leaf metadata should expose the value selected by selector.path."""
+        # Given: a leaf control selecting a nested step input value
+        controls = [
+            MockControlWithIdentity(
+                id=1,
+                name="city_control",
+                control=ControlDefinition(
+                    description="City guardrail",
+                    enabled=True,
+                    execution="server",
+                    scope={"step_types": ["tool"], "stages": ["pre"]},
+                    condition={
+                        "selector": {"path": "input.city"},
+                        "evaluator": {"name": "test-deny", "config": {"value": "match"}},
+                    },
+                    action={"decision": "observe"},
+                ),
+            )
+        ]
+        engine = ControlEngine(controls)
+
+        # When: processing a request where input.city has a concrete value
+        request = EvaluationRequest(
+            agent_name="00000000-0000-0000-0000-000000000001",
+            step=Step(
+                type="tool",
+                name="lookup-weather",
+                input={"city": "San Francisco"},
+                output=None,
+            ),
+            stage="pre",
+        )
+        result = await engine.process(request)
+
+        # Then: event reconstruction can use selected_data as ControlSpan.input
+        assert result.matches is not None
+        metadata = result.matches[0].result.metadata
+        assert metadata is not None
+        assert metadata["selected_data"] == "San Francisco"
+
     @pytest.mark.asyncio
     async def test_composite_results_preserve_decisive_child_metadata(self):
         """Composite results should retain structured metadata from the decisive child."""
diff --git a/server/src/agent_control_server/migrate.py b/server/src/agent_control_server/migrate.py
index 3f260d4a..48c775e0 100644
--- a/server/src/agent_control_server/migrate.py
+++ b/server/src/agent_control_server/migrate.py
@@ -110,6 +110,7 @@ def _acquire_migration_lock(connection: Connection, timeout_seconds: float) -> N
             ).scalar_one()
         )
         if acquired:
+            connection.commit()
             LOGGER.info("Acquired Agent Control migration advisory lock.")
             return
 
@@ -150,6 +151,7 @@ def _serialized_migration(cfg: Config, *, enabled: bool) -> Iterator[None]:
                         _MIGRATION_LOCK_PARAMS,
                     ).scalar_one()
                 )
+                connection.commit()
                 if released:
                     LOGGER.info("Released Agent Control migration advisory lock.")
                 else:
diff --git a/server/tests/test_migrate.py b/server/tests/test_migrate.py
index eaed9798..a82e6dd4 100644
--- a/server/tests/test_migrate.py
+++ b/server/tests/test_migrate.py
@@ -19,6 +19,7 @@ class _FakeConnection:
     def __init__(self, lock_results: list[bool]) -> None:
         self.lock_results = lock_results
         self.statements: list[str] = []
+        self.commits = 0
 
     def __enter__(self) -> _FakeConnection:
         return self
@@ -35,6 +36,9 @@ def execute(self, statement: object, params: object) -> _FakeResult:
             return _FakeResult(True)
         raise AssertionError(f"unexpected SQL statement: {statement_text}")
 
+    def commit(self) -> None:
+        self.commits += 1
+
 
 class _FakeEngine:
     def __init__(self, connection: _FakeConnection) -> None:
@@ -106,6 +110,7 @@ def test_serialized_migration_acquires_and_releases_postgres_lock(monkeypatch) -
         "SELECT pg_try_advisory_lock(:class_id, :object_id)",
         "SELECT pg_advisory_unlock(:class_id, :object_id)",
     ]
+    assert connection.commits == 2
     assert sleeps == [2.0]
     assert engine.disposed
 

From 7210fc10872902083b93ce1e0c340eaf52e580e9 Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Mon, 18 May 2026 10:59:05 -0700
Subject: [PATCH 19/20] add docstring

---
 engine/src/agent_control_engine/core.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py
index fdb323e0..2b2404a9 100644
--- a/engine/src/agent_control_engine/core.py
+++ b/engine/src/agent_control_engine/core.py
@@ -270,7 +270,19 @@ def _composite_metadata(
         *,
         matched: bool,
     ) -> dict[str, Any] | None:
-        """Select stable child metadata to preserve on composite results."""
+        """Select stable child metadata to preserve on composite results.
+
+        The selected_data value in this metadata is not all evaluator inputs.
+        It is the selected value from the leaf metadata the engine preserves for
+        the final composite result:
+        - or where one child matches: selected_data comes from the matching child.
+        - and where one child fails: selected_data comes from the failing child.
+        - and where all children match: selected_data comes from the first
+          matching child, usually the first leaf.
+        - or where no children match: selected_data comes from the first
+          evaluated child.
+        - not: selected_data comes from its child.
+        """
         source_result: EvaluatorResult | None = None
         if matched:
             source_result = next(

From f513dac2d90a70f2ba653cd267ace4947f40278b Mon Sep 17 00:00:00 2001
From: "namrata.ghadi" <namrata.ghadi@galileo.ai>
Date: Fri, 22 May 2026 12:19:00 -0700
Subject: [PATCH 20/20] address comments

---
 engine/src/agent_control_engine/core.py       | 135 ++++++++++++++++--
 engine/tests/test_core.py                     | 115 ++++++++++++++-
 .../luna/client.py                            |  76 ++++++++--
 .../luna/config.py                            |   9 ++
 .../luna/evaluator.py                         |  16 +--
 .../galileo/tests/test_luna_coverage_gaps.py  | 122 ++++++++++++++--
 .../galileo/tests/test_luna_evaluator.py      |   3 +
 examples/galileo_luna/README.md               |  23 ++-
 examples/galileo_luna/demo_agent.py           |  30 +++-
 examples/galileo_luna/setup_controls.py       |  37 +++--
 .../src/agent_control/evaluation_events.py    |  15 +-
 sdks/python/src/agent_control/otel_sink.py    |  10 ++
 .../tests/test_observability_updates.py       |  72 +++++++++-
 sdks/python/tests/test_otel_sink.py           |  22 ++-
 14 files changed, 616 insertions(+), 69 deletions(-)

diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py
index 2b2404a9..e2ae8b6e 100644
--- a/engine/src/agent_control_engine/core.py
+++ b/engine/src/agent_control_engine/core.py
@@ -33,6 +33,108 @@
 # Max concurrent evaluations (limits task spawning overhead for large policies)
 MAX_CONCURRENT_EVALUATIONS = int(os.environ.get("MAX_CONCURRENT_EVALUATIONS", "3"))
 
+SELECTED_DATA_PREVIEW_MAX_CHARS = int(
+    os.environ.get("AGENT_CONTROL_SELECTED_DATA_PREVIEW_MAX_CHARS", "500")
+)
+SELECTED_DATA_PREVIEW_MAX_ITEMS = int(
+    os.environ.get("AGENT_CONTROL_SELECTED_DATA_PREVIEW_MAX_ITEMS", "20")
+)
+SELECTED_DATA_PREVIEW_MAX_DEPTH = int(
+    os.environ.get("AGENT_CONTROL_SELECTED_DATA_PREVIEW_MAX_DEPTH", "3")
+)
+_SENSITIVE_KEY_PARTS = (
+    "api_key",
+    "apikey",
+    "authorization",
+    "credential",
+    "password",
+    "secret",
+    "token",
+)
+
+
+def _env_flag(name: str, *, default: bool = False) -> bool:
+    """Read a boolean environment flag."""
+    value = os.environ.get(name)
+    if value is None:
+        return default
+    return value.strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _is_sensitive_key(key: object) -> bool:
+    """Return whether a mapping key is likely to contain a secret."""
+    normalized = str(key).lower()
+    return any(part in normalized for part in _SENSITIVE_KEY_PARTS)
+
+
+def _truncate_string(value: str, max_chars: int) -> tuple[str, bool]:
+    """Return ``value`` cut to ``max_chars`` (negatives act as 0) and a truncated flag."""
+    if len(value) <= max_chars:
+        return value, False
+    if max_chars <= 3:
+        return value[: max(max_chars, 0)], True  # negative limit would slice from the end
+    return f"{value[: max_chars - 3]}...", True
+
+
+def _selected_data_preview_value(
+    value: Any,
+    *,
+    depth: int = 0,
+) -> tuple[Any, bool]:
+    """Build a bounded, redacted preview of selected data."""
+    if depth >= SELECTED_DATA_PREVIEW_MAX_DEPTH:
+        return "<max depth reached>", True
+
+    if value is None or isinstance(value, bool | int | float):
+        return value, False
+
+    if isinstance(value, str):
+        return _truncate_string(value, SELECTED_DATA_PREVIEW_MAX_CHARS)
+
+    if isinstance(value, dict):
+        preview: dict[str, Any] = {}
+        truncated = len(value) > SELECTED_DATA_PREVIEW_MAX_ITEMS
+        for index, (key, item) in enumerate(value.items()):
+            if index >= SELECTED_DATA_PREVIEW_MAX_ITEMS:
+                break
+            preview_key = str(key)
+            if _is_sensitive_key(key):
+                preview[preview_key] = "<redacted>"
+                truncated = True
+                continue
+            preview_item, item_truncated = _selected_data_preview_value(
+                item,
+                depth=depth + 1,
+            )
+            preview[preview_key] = preview_item
+            truncated = truncated or item_truncated
+        return preview, truncated
+
+    if isinstance(value, list | tuple):
+        preview_items: list[Any] = []
+        truncated = len(value) > SELECTED_DATA_PREVIEW_MAX_ITEMS
+        for item in value[:SELECTED_DATA_PREVIEW_MAX_ITEMS]:
+            preview_item, item_truncated = _selected_data_preview_value(
+                item,
+                depth=depth + 1,
+            )
+            preview_items.append(preview_item)
+            truncated = truncated or item_truncated
+        return preview_items, truncated
+
+    text_preview, truncated = _truncate_string(str(value), SELECTED_DATA_PREVIEW_MAX_CHARS)
+    return text_preview, truncated
+
+
+def _selected_data_preview(value: Any) -> dict[str, Any]:
+    """Return UI-safe selector output details for evaluator-level inspection."""
+    preview, truncated = _selected_data_preview_value(value)
+    return {
+        "type": type(value).__name__,
+        "value": preview,
+        "truncated": truncated,
+    }
+
 
 @functools.lru_cache(maxsize=256)
 def _compile_regex(pattern: str) -> Any:
@@ -102,9 +204,16 @@ def __init__(
         self,
         controls: Sequence[ControlWithIdentity],
         context: Literal["sdk", "server"] = "server",
+        *,
+        include_raw_selected_data: bool | None = None,
     ):
         self.controls = controls
         self.context = context
+        self.include_raw_selected_data = (
+            _env_flag("AGENT_CONTROL_INCLUDE_RAW_SELECTED_DATA")
+            if include_raw_selected_data is None
+            else include_raw_selected_data
+        )
 
     @staticmethod
     def _truncated_message(message: str | None) -> str | None:
@@ -224,7 +333,9 @@ async def _evaluate_leaf(
             "message": self._truncated_message(result.message),
         }
         metadata = dict(result.metadata or {})
-        metadata["selected_data"] = data
+        if self.include_raw_selected_data:
+            metadata["engine_selected_data"] = data
+        metadata["engine_selected_data_preview"] = _selected_data_preview(data)
         metadata["condition_trace"] = trace
         return _ConditionEvaluation(
             result=result.model_copy(update={"metadata": metadata}),
@@ -272,16 +383,18 @@ def _composite_metadata(
     ) -> dict[str, Any] | None:
         """Select stable child metadata to preserve on composite results.
 
-        The selected_data value in this metadata is not all evaluator inputs.
-        It is the selected value from the leaf metadata the engine preserves for
-        the final composite result:
-        - or where one child matches: selected_data comes from the matching child.
-        - and where one child fails: selected_data comes from the failing child.
-        - and where all children match: selected_data comes from the first
-          matching child, usually the first leaf.
-        - or where no children match: selected_data comes from the first
-          evaluated child.
-        - not: selected_data comes from its child.
+        The engine_selected_data_preview value in this metadata is not all
+        evaluator inputs. It is the bounded selected value preview from the leaf
+        metadata the engine preserves for the final composite result:
+        - or where one child matches: engine_selected_data_preview comes from the
+          matching child.
+        - and where one child fails: engine_selected_data_preview comes from the
+          failing child.
+        - and where all children match: engine_selected_data_preview comes from the
+          first matching child, usually the first leaf.
+        - or where no children match: engine_selected_data_preview comes from the
+          first evaluated child.
+        - not: engine_selected_data_preview comes from its child.
         """
         source_result: EvaluatorResult | None = None
         if matched:
diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py
index d2eb0871..ed4e6e00 100644
--- a/engine/tests/test_core.py
+++ b/engine/tests/test_core.py
@@ -157,7 +157,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
             matched=matched,
             confidence=0.8 if matched else 0.4,
             message=f"Metadata {self.config.value}",
-            metadata={"source": self.config.value, "selected_data": data},
+            metadata={"source": self.config.value, "selected_data": f"evaluator:{data}"},
         )
         _execution_log.append(f"metadata:{self.config.value}:end")
         return result
@@ -1413,8 +1413,8 @@ async def test_or_short_circuit_records_skipped_trace(self):
         assert trace["children"][1]["short_circuit_reason"] == "or_matched"
 
     @pytest.mark.asyncio
-    async def test_leaf_metadata_includes_selector_selected_data(self):
-        """Leaf metadata should expose the value selected by selector.path."""
+    async def test_leaf_metadata_includes_selector_selected_data_preview(self):
+        """Leaf metadata should expose a safe preview of the selected selector.path value."""
         # Given: a leaf control selecting a nested step input value
         controls = [
             MockControlWithIdentity(
@@ -1448,11 +1448,109 @@ async def test_leaf_metadata_includes_selector_selected_data(self):
         )
         result = await engine.process(request)
 
-        # Then: event reconstruction can use selected_data as ControlSpan.input
+        # Then: UI consumers can inspect the selected value without raw data export.
         assert result.matches is not None
         metadata = result.matches[0].result.metadata
         assert metadata is not None
-        assert metadata["selected_data"] == "San Francisco"
+        assert "selected_data" not in metadata
+        assert metadata["engine_selected_data_preview"] == {
+            "type": "str",
+            "value": "San Francisco",
+            "truncated": False,
+        }
+
+    @pytest.mark.asyncio
+    async def test_leaf_selected_data_preview_is_bounded_and_redacted(self):
+        """Selected data previews should cap payload size and redact secret-like keys."""
+        # Given: a leaf control selecting a large object with a secret-like key
+        controls = [
+            MockControlWithIdentity(
+                id=1,
+                name="payload_control",
+                control=ControlDefinition(
+                    description="Payload guardrail",
+                    enabled=True,
+                    execution="server",
+                    scope={"step_types": ["tool"], "stages": ["pre"]},
+                    condition={
+                        "selector": {"path": "input"},
+                        "evaluator": {"name": "test-deny", "config": {"value": "match"}},
+                    },
+                    action={"decision": "observe"},
+                ),
+            )
+        ]
+        engine = ControlEngine(controls)
+        request = EvaluationRequest(
+            agent_name="00000000-0000-0000-0000-000000000001",
+            step=Step(
+                type="tool",
+                name="send-payload",
+                input={
+                    "prompt": "x" * 600,
+                    "api_key": "secret-value",
+                },
+                output=None,
+            ),
+            stage="pre",
+        )
+
+        # When: processing the request
+        result = await engine.process(request)
+
+        # Then: the preview is useful for UI inspection but does not expose the raw payload.
+        assert result.matches is not None
+        metadata = result.matches[0].result.metadata
+        assert metadata is not None
+        preview = metadata["engine_selected_data_preview"]
+        assert preview["type"] == "dict"
+        assert preview["truncated"] is True
+        assert preview["value"]["api_key"] == "<redacted>"
+        assert preview["value"]["prompt"].endswith("...")
+        assert len(preview["value"]["prompt"]) == 500
+
+    @pytest.mark.asyncio
+    async def test_engine_selected_data_does_not_overwrite_evaluator_metadata(self):
+        """Engine-owned selector data should not collide with evaluator-owned metadata."""
+        # Given: an evaluator that deliberately returns its own selected_data key
+        controls = [
+            MockControlWithIdentity(
+                id=1,
+                name="metadata_control",
+                control=ControlDefinition(
+                    description="Metadata guardrail",
+                    enabled=True,
+                    execution="server",
+                    scope={"step_types": ["llm"], "stages": ["pre"]},
+                    condition={
+                        "selector": {"path": "input"},
+                        "evaluator": {"name": "test-metadata", "config": {"value": "match"}},
+                    },
+                    action={"decision": "observe"},
+                ),
+            )
+        ]
+        engine = ControlEngine(controls, include_raw_selected_data=True)
+        request = EvaluationRequest(
+            agent_name="00000000-0000-0000-0000-000000000001",
+            step=Step(type="llm", name="test-step", input="raw input", output=None),
+            stage="pre",
+        )
+
+        # When: processing the request
+        result = await engine.process(request)
+
+        # Then: evaluator-owned metadata remains intact and engine-owned data is namespaced.
+        assert result.matches is not None
+        metadata = result.matches[0].result.metadata
+        assert metadata is not None
+        assert metadata["selected_data"] == "evaluator:raw input"
+        assert metadata["engine_selected_data"] == "raw input"
+        assert metadata["engine_selected_data_preview"] == {
+            "type": "str",
+            "value": "raw input",
+            "truncated": False,
+        }
 
     @pytest.mark.asyncio
     async def test_composite_results_preserve_decisive_child_metadata(self):
@@ -1511,7 +1609,12 @@ async def test_composite_results_preserve_decisive_child_metadata(self):
         metadata = result.matches[0].result.metadata
         assert metadata is not None
         assert metadata["source"] == "match-right"
-        assert metadata["selected_data"] == "chosen"
+        assert metadata["selected_data"] == "evaluator:chosen"
+        assert metadata["engine_selected_data_preview"] == {
+            "type": "str",
+            "value": "chosen",
+            "truncated": False,
+        }
         assert metadata["condition_trace"]["type"] == "or"
         assert "slow:skip-tail:start" not in _execution_log
 
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
index 51d34c96..3bbc807f 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -9,6 +9,7 @@
 from hmac import new as hmac_new
 from json import dumps
 from time import time
+from typing import Literal
 
 import httpx
 from agent_control_models import JSONObject, JSONValue
@@ -20,6 +21,7 @@
 DEFAULT_INTERNAL_TOKEN_TTL_SECS = 3600
 PUBLIC_SCORER_INVOKE_PATH = "/scorers/invoke"
 INTERNAL_SCORER_INVOKE_PATH = "/internal/scorers/invoke"
+AuthMode = Literal["public", "internal"]
 
 
 def _b64url(data: bytes) -> str:
@@ -49,6 +51,18 @@ def _internal_auth_token(
     return f"{signing_input}.{_b64url(signature)}"
 
 
+def _env_auth_mode() -> AuthMode | None:
+    value = os.getenv("GALILEO_LUNA_AUTH_MODE")
+    if value is None or value.strip() == "":
+        return None
+    normalized = value.strip().lower()
+    if normalized == "public":
+        return "public"
+    if normalized == "internal":
+        return "internal"
+    raise ValueError("GALILEO_LUNA_AUTH_MODE must be either 'public' or 'internal'.")
+
+
 def _as_float_or_none(value: JSONValue) -> float | None:
     if isinstance(value, bool) or value is None:
         return None
@@ -151,6 +165,7 @@ class GalileoLunaClient:
     Environment Variables:
         GALILEO_API_SECRET_KEY or GALILEO_API_SECRET: Galileo API internal JWT signing secret.
         GALILEO_API_KEY: Galileo API key fallback for public scorer invocation.
+        GALILEO_LUNA_AUTH_MODE: Auth mode, either "public" or "internal".
         GALILEO_CONSOLE_URL: Galileo Console URL (optional, defaults to production).
     """
 
@@ -160,6 +175,7 @@ def __init__(
         api_secret: str | None = None,
         console_url: str | None = None,
         api_url: str | None = None,
+        auth_mode: AuthMode | None = None,
     ) -> None:
         """Initialize the Galileo Luna client.
 
@@ -171,22 +187,26 @@ def __init__(
                 GALILEO_CONSOLE_URL or uses the production console URL.
             api_url: Galileo API URL. If not provided, reads from GALILEO_API_URL
                 before deriving from the console URL.
+            auth_mode: Auth mode to use. If not provided, reads from
+                GALILEO_LUNA_AUTH_MODE, or infers from the single available credential.
 
         Raises:
-            ValueError: If neither API secret nor API key is provided.
+            ValueError: If credentials are missing, ambiguous, or incompatible with
+                the selected auth mode.
         """
         resolved_api_secret = (
             api_secret or os.getenv("GALILEO_API_SECRET_KEY") or os.getenv("GALILEO_API_SECRET")
         )
         resolved_api_key = api_key or os.getenv("GALILEO_API_KEY")
-        if not resolved_api_secret and not resolved_api_key:
-            raise ValueError(
-                "GALILEO_API_SECRET_KEY or GALILEO_API_KEY is required. "
-                "Set one as an environment variable or pass it to the constructor."
-            )
+        resolved_auth_mode = self._resolve_auth_mode(
+            auth_mode or _env_auth_mode(),
+            api_key=resolved_api_key,
+            api_secret=resolved_api_secret,
+        )
 
         self.api_key = resolved_api_key
         self.api_secret = resolved_api_secret
+        self.auth_mode = resolved_auth_mode
         self.console_url = (
             console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai"
         )
@@ -194,6 +214,44 @@ def __init__(
             "/"
         ) or self._derive_api_url(self.console_url)
         self._client: httpx.AsyncClient | None = None
+        logger.info("[GalileoLunaClient] Auth mode selected: %s", self.auth_mode)
+
+    @staticmethod
+    def _resolve_auth_mode(
+        auth_mode: AuthMode | None,
+        *,
+        api_key: str | None,
+        api_secret: str | None,
+    ) -> AuthMode:
+        if auth_mode == "public":
+            if not api_key:
+                raise ValueError(
+                    "GALILEO_API_KEY is required when GALILEO_LUNA_AUTH_MODE=public."
+                )
+            return "public"
+
+        if auth_mode == "internal":
+            if not api_secret:
+                raise ValueError(
+                    "GALILEO_API_SECRET_KEY or GALILEO_API_SECRET is required when "
+                    "GALILEO_LUNA_AUTH_MODE=internal."
+                )
+            return "internal"
+
+        if api_key and api_secret:
+            raise ValueError(
+                "Both Galileo API key and API secret are configured. Set "
+                "GALILEO_LUNA_AUTH_MODE to 'public' or 'internal' to choose the "
+                "runtime auth mode explicitly."
+            )
+        if api_secret:
+            return "internal"
+        if api_key:
+            return "public"
+        raise ValueError(
+            "GALILEO_API_SECRET_KEY or GALILEO_API_KEY is required. "
+            "Set one as an environment variable or pass it to the constructor."
+        )
 
     def _derive_api_url(self, console_url: str) -> str:
         """Derive the API URL from a Galileo Console URL."""
@@ -215,7 +273,7 @@ async def _get_client(self) -> httpx.AsyncClient:
         """Get or create the HTTP client."""
         if self._client is None or self._client.is_closed:
             headers = {"Content-Type": "application/json"}
-            if self.api_secret is None and self.api_key is not None:
+            if self.auth_mode == "public" and self.api_key is not None:
                 headers["Galileo-API-Key"] = self.api_key
             self._client = httpx.AsyncClient(
                 headers=headers,
@@ -228,9 +286,11 @@ def _endpoint_and_headers(
         headers: dict[str, str] | None,
     ) -> tuple[str, dict[str, str]]:
         request_headers = dict(headers or {})
-        if self.api_secret is None:
+        if self.auth_mode == "public":
             return f"{self.api_base}{PUBLIC_SCORER_INVOKE_PATH}", request_headers
 
+        if self.api_secret is None:
+            raise RuntimeError("Internal Luna auth mode is missing an API secret.")
         request_headers["Authorization"] = f"Bearer {_internal_auth_token(self.api_secret)}"
         return f"{self.api_base}{INTERNAL_SCORER_INVOKE_PATH}", request_headers
 
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
index c49dd716..788fa24c 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -9,6 +9,7 @@
 from pydantic import Field, model_validator
 
 LunaOperator = Literal["gt", "gte", "lt", "lte", "eq", "ne", "contains", "any"]
+LunaPayloadField = Literal["input", "output"]
 
 _NUMERIC_OPERATORS = frozenset({"gt", "gte", "lt", "lte"})
 
@@ -37,6 +38,7 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         threshold: Local threshold used by the evaluator for comparison.
         operator: Local comparison operator. Numeric operators use threshold as a number.
         scorer_config: Optional scorer-specific config sent as ``config``.
+        payload_field: Scorer field ("input" or "output") that receives scalar selected data.
         timeout_ms: Request timeout in milliseconds.
     """
 
@@ -69,6 +71,13 @@ class LunaEvaluatorConfig(EvaluatorConfig):
         serialization_alias="config",
         description="Optional scorer-specific configuration sent to Galileo.",
     )
+    payload_field: LunaPayloadField = Field(
+        default="input",
+        description=(
+            "Which scorer input side to use when selector output is a scalar value. "
+            "Structured selected data with input/output keys overrides this setting."
+        ),
+    )
     timeout_ms: int = Field(
         default=10000,
         ge=1000,
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
index ce46cf44..7b48052f 100644
--- a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
+++ b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -61,8 +61,6 @@ def _contains(score: JSONValue, threshold: JSONValue) -> bool:
     if isinstance(score, list):
         return threshold in score
     if isinstance(score, dict):
-        if isinstance(threshold, str) and threshold in score:
-            return True
         return threshold in score.values()
     return False
 
@@ -116,12 +114,10 @@ def __init__(self, config: LunaEvaluatorConfig) -> None:
             )
 
         super().__init__(config)
-        self._client: GalileoLunaClient | None = None
+        self._client = GalileoLunaClient()
 
     def _get_client(self) -> GalileoLunaClient:
-        """Get or create the Galileo Luna client."""
-        if self._client is None:
-            self._client = GalileoLunaClient()
+        """Get the Galileo Luna client."""
         return self._client
 
     def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
@@ -133,8 +129,7 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
                 return input_text, output_text
 
         text = _coerce_payload_text(data)
-        scorer_label = self.config.scorer_label or ""
-        if "output" in scorer_label:
+        if self.config.payload_field == "output":
             return None, text
         return text, None
 
@@ -262,7 +257,6 @@ def _handle_error(
             confidence=0.0,
             message=f"Luna evaluation error: {error_detail}",
             metadata={
-                "error": error_detail,
                 "error_type": type(error).__name__,
                 "scorer_label": self.config.scorer_label,
                 "scorer_id": self.config.scorer_id,
@@ -273,6 +267,4 @@ def _handle_error(
 
     async def aclose(self) -> None:
         """Close the underlying Galileo Luna client."""
-        if self._client is not None:
-            await self._client.close()
-            self._client = None
+        await self._client.close()
diff --git a/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py
index f4d0e360..68755c99 100644
--- a/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py
+++ b/evaluators/contrib/galileo/tests/test_luna_coverage_gaps.py
@@ -76,7 +76,7 @@ def test_present_key_coerced(self):
 
 
 class TestContains:
-    """``_contains`` supports str/list/dict scores against a threshold."""
+    """``_contains`` checks str/list scores and dict values (not keys) against a threshold."""
 
     def test_none_threshold_is_no_match(self):
         from agent_control_evaluator_galileo.luna.evaluator import _contains
@@ -95,10 +95,10 @@ def test_list_contains_value(self):
         assert _contains(["a", "b", "c"], "b") is True
         assert _contains(["a", "b", "c"], "z") is False
 
-    def test_dict_threshold_matches_key(self):
+    def test_dict_threshold_does_not_match_key(self):
         from agent_control_evaluator_galileo.luna.evaluator import _contains
 
-        assert _contains({"toxicity": 0.9}, "toxicity") is True
+        assert _contains({"toxicity": 0.9}, "toxicity") is False
 
     def test_dict_threshold_matches_value(self):
         from agent_control_evaluator_galileo.luna.evaluator import _contains
@@ -216,7 +216,7 @@ def test_numeric_operator_rejects_non_numeric_score(self, monkeypatch):
 
 
 class TestPreparePayload:
-    """``_prepare_payload`` routes scalar data based on the scorer label."""
+    """``_prepare_payload`` routes scalar data using explicit config."""
 
     def test_scalar_routed_to_input_when_label_lacks_output(self, monkeypatch):
         monkeypatch.setenv("GALILEO_API_KEY", "test-key")
@@ -229,12 +229,16 @@ def test_scalar_routed_to_input_when_label_lacks_output(self, monkeypatch):
         assert input_text == "hello"
         assert output_text is None
 
-    def test_scalar_routed_to_output_when_label_contains_output(self, monkeypatch):
+    def test_scalar_routed_to_output_when_payload_field_is_output(self, monkeypatch):
         monkeypatch.setenv("GALILEO_API_KEY", "test-key")
         from agent_control_evaluator_galileo.luna import LunaEvaluator
 
         evaluator = LunaEvaluator.from_dict(
-            {"scorer_label": "output_correctness", "threshold": 0.5}
+            {
+                "scorer_label": "toxicity",
+                "threshold": 0.5,
+                "payload_field": "output",
+            }
         )
 
         input_text, output_text = evaluator._prepare_payload("hello")
@@ -242,10 +246,45 @@ def test_scalar_routed_to_output_when_label_contains_output(self, monkeypatch):
         assert input_text is None
         assert output_text == "hello"
 
+    def test_scalar_output_label_without_payload_field_still_defaults_to_input(
+        self,
+        monkeypatch,
+    ):
+        monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        evaluator = LunaEvaluator.from_dict(
+            {"scorer_label": "output_correctness", "threshold": 0.5}
+        )
+
+        input_text, output_text = evaluator._prepare_payload("hello")
+
+        assert input_text == "hello"
+        assert output_text is None
+
+    def test_structured_payload_uses_input_output_keys_over_payload_field(self, monkeypatch):
+        monkeypatch.setenv("GALILEO_API_KEY", "test-key")
+        from agent_control_evaluator_galileo.luna import LunaEvaluator
+
+        evaluator = LunaEvaluator.from_dict(
+            {
+                "scorer_label": "toxicity",
+                "threshold": 0.5,
+                "payload_field": "output",
+            }
+        )
+
+        input_text, output_text = evaluator._prepare_payload(
+            {"input": "prompt", "output": "answer"}
+        )
+
+        assert input_text == "prompt"
+        assert output_text == "answer"
+
 
 @pytest.mark.asyncio
 async def test_evaluator_aclose_closes_underlying_client(monkeypatch):
-    """``aclose`` must release the HTTP client when one was created."""
+    """``aclose`` must release the eagerly-created client without clearing it."""
     monkeypatch.setenv("GALILEO_API_KEY", "test-key")
     from agent_control_evaluator_galileo.luna import LunaEvaluator
 
@@ -258,7 +297,7 @@ async def test_evaluator_aclose_closes_underlying_client(monkeypatch):
     await evaluator.aclose()
 
     fake.close.assert_awaited_once()
-    assert evaluator._client is None
+    assert evaluator._client is fake
 
 
 @pytest.mark.asyncio
@@ -402,6 +441,7 @@ def test_client_raises_when_no_credentials(monkeypatch):
         "GALILEO_API_SECRET_KEY",
         "GALILEO_API_SECRET",
         "GALILEO_API_KEY",
+        "GALILEO_LUNA_AUTH_MODE",
     ):
         monkeypatch.delenv(name, raising=False)
     from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
@@ -410,10 +450,76 @@ def test_client_raises_when_no_credentials(monkeypatch):
         GalileoLunaClient()
 
 
+def test_client_requires_explicit_mode_when_both_credentials_are_present(monkeypatch):
+    """A mixed credential environment must not silently choose an auth route."""
+    monkeypatch.setenv("GALILEO_API_KEY", "public-key")
+    monkeypatch.setenv("GALILEO_API_SECRET_KEY", "internal-secret")
+    monkeypatch.delenv("GALILEO_LUNA_AUTH_MODE", raising=False)
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    with pytest.raises(ValueError, match="Both Galileo API key and API secret"):
+        GalileoLunaClient()
+
+
+def test_client_uses_explicit_public_mode_when_both_credentials_are_present(monkeypatch):
+    """Explicit public mode should use the API-key route even if a secret is also set."""
+    monkeypatch.setenv("GALILEO_API_KEY", "public-key")
+    monkeypatch.setenv("GALILEO_API_SECRET_KEY", "internal-secret")
+    monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "public")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+
+    assert client.auth_mode == "public"
+    endpoint, request_headers = client._endpoint_and_headers(None)
+    assert endpoint.endswith("/scorers/invoke")
+    assert "Authorization" not in request_headers
+
+
+def test_client_uses_explicit_internal_mode_when_both_credentials_are_present(monkeypatch):
+    """Explicit internal mode should use the internal JWT route."""
+    monkeypatch.setenv("GALILEO_API_KEY", "public-key")
+    monkeypatch.setenv("GALILEO_API_SECRET_KEY", "internal-secret")
+    monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "internal")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    client = GalileoLunaClient()
+
+    assert client.auth_mode == "internal"
+    endpoint, request_headers = client._endpoint_and_headers(None)
+    assert endpoint.endswith("/internal/scorers/invoke")
+    assert request_headers["Authorization"].startswith("Bearer ")
+
+
+def test_client_rejects_mode_without_matching_credential(monkeypatch):
+    """The selected mode must have its matching credential configured."""
+    monkeypatch.delenv("GALILEO_API_SECRET_KEY", raising=False)
+    monkeypatch.delenv("GALILEO_API_SECRET", raising=False)
+    monkeypatch.setenv("GALILEO_API_KEY", "public-key")
+    monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "internal")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY"):
+        GalileoLunaClient()
+
+
+def test_client_rejects_invalid_auth_mode(monkeypatch):
+    """Invalid auth mode values should fail during client initialization."""
+    monkeypatch.setenv("GALILEO_API_KEY", "public-key")
+    monkeypatch.setenv("GALILEO_LUNA_AUTH_MODE", "sideways")
+    from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
+
+    with pytest.raises(ValueError, match="GALILEO_LUNA_AUTH_MODE"):
+        GalileoLunaClient()
+
+
 class TestDeriveApiUrl:
     """URL derivation covers every console.* → api.* substitution branch."""
 
     def _client(self, monkeypatch):
+        monkeypatch.delenv("GALILEO_API_SECRET_KEY", raising=False)
+        monkeypatch.delenv("GALILEO_API_SECRET", raising=False)
+        monkeypatch.delenv("GALILEO_LUNA_AUTH_MODE", raising=False)
         monkeypatch.setenv("GALILEO_API_KEY", "test-key")
         from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
 
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
index 1b7a6e94..e0cd2051 100644
--- a/evaluators/contrib/galileo/tests/test_luna_evaluator.py
+++ b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -42,6 +42,7 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
         assert config.threshold == 0.7
         assert config.operator == "gte"
         assert config.scorer_config == {"temperature": 0}
+        assert config.payload_field == "input"
 
     def test_config_accepts_scorer_id_without_label(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluatorConfig
@@ -475,4 +476,6 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
         assert result.matched is False
         assert result.error == "service unavailable"
         assert result.metadata is not None
+        assert "error" not in result.metadata
+        assert result.metadata["error_type"] == "RuntimeError"
         assert "fallback_action" not in result.metadata
diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md
index 534ef640..5ac97cda 100644
--- a/examples/galileo_luna/README.md
+++ b/examples/galileo_luna/README.md
@@ -17,26 +17,43 @@ Start the Agent Control server from the repo root:
 make server-run
 ```
 
-Configure Galileo:
+Configure Galileo public API-key auth:
 
 ```bash
+export GALILEO_LUNA_AUTH_MODE="public"
 export GALILEO_API_KEY="your-api-key"
 export GALILEO_CONSOLE_URL="https://console.demo-v2.galileocloud.io"
 ```
 
-If the scorer requires explicit project resolution, set:
+For internal deployments, use internal auth instead:
 
 ```bash
-export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000"
+export GALILEO_LUNA_AUTH_MODE="internal"
+export GALILEO_API_SECRET_KEY="your-api-secret"
+export GALILEO_API_URL="https://api.default.svc.cluster.local:8088"
 ```
 
 Optional scorer settings:
 
 ```bash
 export GALILEO_LUNA_SCORER_LABEL="toxicity"
+# Or select by scorer id/version instead of label:
+# export GALILEO_LUNA_SCORER_ID="scorer-id"
+# export GALILEO_LUNA_SCORER_VERSION_ID="scorer-version-id"
 export GALILEO_LUNA_THRESHOLD="0.5"
+export GALILEO_LUNA_PAYLOAD_FIELD="output"
 ```
 
+`GALILEO_LUNA_PAYLOAD_FIELD` selects which scorer field (`input` or `output`)
+receives scalar selected data. This example selects the agent's drafted reply
+with `selector.path="output"`, so it sends that scalar as the scorer `output`
+field. If a selector returns structured data with `input` and/or `output` keys,
+those keys are sent directly and override `GALILEO_LUNA_PAYLOAD_FIELD`.
+
+If both `GALILEO_API_KEY` and `GALILEO_API_SECRET_KEY`/`GALILEO_API_SECRET` are
+set, `GALILEO_LUNA_AUTH_MODE` is required so the client does not silently choose
+an auth path.
+
 Run:
 
 ```bash
diff --git a/examples/galileo_luna/demo_agent.py b/examples/galileo_luna/demo_agent.py
index 878023cf..8c7f59b2 100644
--- a/examples/galileo_luna/demo_agent.py
+++ b/examples/galileo_luna/demo_agent.py
@@ -4,7 +4,7 @@
 Prerequisites:
     1. Start server: make server-run
     2. Create controls: uv run python setup_controls.py
-    3. Set GALILEO_API_KEY where this script runs
+    3. Set Galileo credentials where this script runs
 
 Usage:
     uv run python demo_agent.py
@@ -21,6 +21,7 @@
 
 AGENT_NAME = "galileo-luna-agent"
 SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
+LUNA_AUTH_MODE = os.getenv("GALILEO_LUNA_AUTH_MODE")
 
 logging.basicConfig(
     level=logging.INFO,
@@ -90,9 +91,29 @@ def init_agent() -> None:
 
 async def run_demo() -> None:
     """Run scripted scenarios."""
-    if not os.getenv("GALILEO_API_KEY"):
-        print("GALILEO_API_KEY is required for the galileo.luna evaluator.")
-        print("Set it before running this demo.")
+    api_key = os.getenv("GALILEO_API_KEY")
+    api_secret = os.getenv("GALILEO_API_SECRET_KEY") or os.getenv("GALILEO_API_SECRET")
+    if not api_key and not api_secret:
+        print(
+            "Galileo credentials are required for the galileo.luna evaluator. "
+            "Set GALILEO_API_KEY for public mode or GALILEO_API_SECRET_KEY for "
+            "internal mode."
+        )
+        return
+    if api_key and api_secret and LUNA_AUTH_MODE not in {"public", "internal"}:
+        print(
+            "Both GALILEO_API_KEY and GALILEO_API_SECRET_KEY/GALILEO_API_SECRET are set. "
+            "Set GALILEO_LUNA_AUTH_MODE to 'public' or 'internal'."
+        )
+        return
+    if LUNA_AUTH_MODE == "public" and not api_key:
+        print("GALILEO_API_KEY is required when GALILEO_LUNA_AUTH_MODE=public.")
+        return
+    if LUNA_AUTH_MODE == "internal" and not api_secret:
+        print(
+            "GALILEO_API_SECRET_KEY or GALILEO_API_SECRET is required when "
+            "GALILEO_LUNA_AUTH_MODE=internal."
+        )
         return
 
     print("=" * 72)
@@ -100,6 +121,7 @@ async def run_demo() -> None:
     print("=" * 72)
     print(f"Server: {SERVER_URL}")
     print(f"Agent:  {AGENT_NAME}")
+    print(f"Auth:   GALILEO_LUNA_AUTH_MODE={LUNA_AUTH_MODE or '(auto if one credential)'}")
     print()
 
     init_agent()
diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py
index 69a36ad5..fb4c6c76 100644
--- a/examples/galileo_luna/setup_controls.py
+++ b/examples/galileo_luna/setup_controls.py
@@ -3,8 +3,9 @@
 
 Prerequisites:
     - Agent Control server running at AGENT_CONTROL_URL, default http://localhost:8000
-    - GALILEO_API_KEY set where demo_agent.py will run
-    - Optional GALILEO_PROJECT_ID for project-scoped scorer resolution
+    - Galileo credentials set where demo_agent.py will run:
+      GALILEO_API_KEY with GALILEO_LUNA_AUTH_MODE=public, or
+      GALILEO_API_SECRET_KEY/GALILEO_API_SECRET with GALILEO_LUNA_AUTH_MODE=internal
 
 Usage:
     uv run python setup_controls.py
@@ -24,8 +25,14 @@
 SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
 
 LUNA_SCORER_LABEL = os.getenv("GALILEO_LUNA_SCORER_LABEL", "toxicity")
+LUNA_SCORER_ID = os.getenv("GALILEO_LUNA_SCORER_ID")
+LUNA_SCORER_VERSION_ID = os.getenv("GALILEO_LUNA_SCORER_VERSION_ID")
 LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5"))
-GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID")
+LUNA_PAYLOAD_FIELD = os.getenv("GALILEO_LUNA_PAYLOAD_FIELD", "output")
+LUNA_AUTH_MODE = os.getenv("GALILEO_LUNA_AUTH_MODE")
+
+if LUNA_PAYLOAD_FIELD not in {"input", "output"}:
+    raise ValueError("GALILEO_LUNA_PAYLOAD_FIELD must be either 'input' or 'output'.")
 
 DEMO_STEPS = [
     {
@@ -41,14 +48,16 @@
 def luna_config() -> dict[str, Any]:
     """Build the direct Luna evaluator config used by the composite control."""
     config: dict[str, Any] = {
-        "scorer_label": LUNA_SCORER_LABEL,
         "threshold": LUNA_THRESHOLD,
         "operator": "gte",
-        "payload_field": "output",
-        "on_error": "allow",
+        "payload_field": LUNA_PAYLOAD_FIELD,
     }
-    if GALILEO_PROJECT_ID:
-        config["project_id"] = GALILEO_PROJECT_ID
+    if LUNA_SCORER_LABEL:
+        config["scorer_label"] = LUNA_SCORER_LABEL
+    if LUNA_SCORER_ID:
+        config["scorer_id"] = LUNA_SCORER_ID
+    if LUNA_SCORER_VERSION_ID:
+        config["scorer_version_id"] = LUNA_SCORER_VERSION_ID
     return config
 
 
@@ -158,9 +167,15 @@ async def setup_demo() -> None:
     print("Setting up direct Galileo Luna demo controls")
     print(f"Server: {SERVER_URL}")
     print(f"Agent:  {AGENT_NAME}")
-    print(f"Luna:   scorer_label={LUNA_SCORER_LABEL!r}, threshold={LUNA_THRESHOLD}")
-    if GALILEO_PROJECT_ID:
-        print(f"Project ID: {GALILEO_PROJECT_ID}")
+    print(
+        "Luna:   "
+        f"scorer_label={LUNA_SCORER_LABEL!r}, "
+        f"scorer_id={LUNA_SCORER_ID!r}, "
+        f"scorer_version_id={LUNA_SCORER_VERSION_ID!r}, "
+        f"threshold={LUNA_THRESHOLD}, "
+        f"payload_field={LUNA_PAYLOAD_FIELD!r}"
+    )
+    print(f"Auth:   GALILEO_LUNA_AUTH_MODE={LUNA_AUTH_MODE or '(auto if one credential)'}")
 
     async with AgentControlClient(base_url=SERVER_URL, timeout=30.0) as client:
         await client.health_check()
diff --git a/sdks/python/src/agent_control/evaluation_events.py b/sdks/python/src/agent_control/evaluation_events.py
index 0efe6e86..a0b37a03 100644
--- a/sdks/python/src/agent_control/evaluation_events.py
+++ b/sdks/python/src/agent_control/evaluation_events.py
@@ -22,6 +22,19 @@
 _FALLBACK_TRACE_ID = "0" * 32
 _FALLBACK_SPAN_ID = "0" * 16
 _trace_warning_logged = False
+_DEBUG_METADATA_KEYS = frozenset(
+    {
+        "selected_data",
+        "selected_data_preview",
+        "engine_selected_data",
+        "engine_selected_data_preview",
+    }
+)
+
+
+def _safe_event_metadata(metadata: dict[str, object]) -> dict[str, object]:
+    """Drop raw/debug metadata that should not be exported as observability data."""
+    return {key: value for key, value in metadata.items() if key not in _DEBUG_METADATA_KEYS}
 
 
 def observability_metadata(
@@ -88,7 +101,7 @@ def _build_events_for_matches(
 
     for match in matches:
         control_def = control_lookup.get(match.control_id)
-        event_metadata = dict(match.result.metadata or {})
+        event_metadata = _safe_event_metadata(dict(match.result.metadata or {}))
         selector_path = None
         evaluator_name = None
 
diff --git a/sdks/python/src/agent_control/otel_sink.py b/sdks/python/src/agent_control/otel_sink.py
index a238dac6..e724f5af 100644
--- a/sdks/python/src/agent_control/otel_sink.py
+++ b/sdks/python/src/agent_control/otel_sink.py
@@ -28,6 +28,14 @@
     "OpenTelemetry sink selected but no OTLP exporter configuration was found; "
     "control events will not be exported"
 )
+_DEBUG_METADATA_ATTRIBUTE_KEYS = frozenset(
+    {
+        "selected_data",
+        "selected_data_preview",
+        "engine_selected_data",
+        "engine_selected_data_preview",
+    }
+)
 
 AttributeValue = str | bool | int | float | list[str] | list[bool] | list[int] | list[float]
 
@@ -129,6 +137,8 @@ def control_event_to_otel_span(event: ControlExecutionEvent) -> OTELControlEvent
         attributes["agent_control.error_message"] = event.error_message
 
     for key, value in sorted(event.metadata.items()):
+        if key in _DEBUG_METADATA_ATTRIBUTE_KEYS:
+            continue
         attributes[f"agent_control.metadata.{key}"] = _normalize_attribute_value(value)
 
     return OTELControlEventSpan(
diff --git a/sdks/python/tests/test_observability_updates.py b/sdks/python/tests/test_observability_updates.py
index 181d3c6c..dd7f5d2f 100644
--- a/sdks/python/tests/test_observability_updates.py
+++ b/sdks/python/tests/test_observability_updates.py
@@ -67,14 +67,21 @@ def _make_response(self, **kwargs):
         defaults.update(kwargs)
         return EvaluationResponse(**defaults)
 
-    def _make_match(self, control_id, control_name="ctrl", action="observe", matched=True):
+    def _make_match(
+        self,
+        control_id,
+        control_name="ctrl",
+        action="observe",
+        matched=True,
+        metadata=None,
+    ):
         from agent_control_models import ControlMatch, EvaluatorResult
 
         return ControlMatch(
             control_id=control_id,
             control_name=control_name,
             action=action,
-            result=EvaluatorResult(matched=matched, confidence=0.9),
+            result=EvaluatorResult(matched=matched, confidence=0.9, metadata=metadata),
         )
 
     def test_combines_matches_errors_and_non_matches(self):
@@ -172,14 +179,21 @@ def _make_request(self, step_type="llm"):
             stage="pre",
         )
 
-    def _make_match(self, control_id, control_name="ctrl", action="observe", matched=True):
+    def _make_match(
+        self,
+        control_id,
+        control_name="ctrl",
+        action="observe",
+        matched=True,
+        metadata=None,
+    ):
         from agent_control_models import ControlMatch, EvaluatorResult
 
         return ControlMatch(
             control_id=control_id,
             control_name=control_name,
             action=action,
-            result=EvaluatorResult(matched=matched, confidence=0.9),
+            result=EvaluatorResult(matched=matched, confidence=0.9, metadata=metadata),
         )
 
     def _make_response(self, matches=None, errors=None, non_matches=None):
@@ -224,6 +238,56 @@ def test_builds_events_with_trace_context(self):
         assert event.evaluator_name == "regex"
         assert event.selector_path == "input"
 
+    def test_drops_raw_selected_data_from_event_metadata(self):
+        response = self._make_response(
+            matches=[
+                self._make_match(
+                    1,
+                    "ctrl-1",
+                    metadata={
+                        "selected_data": {"prompt": "raw sensitive input"},
+                        "selected_data_preview": {
+                            "type": "dict",
+                            "value": {"prompt": "raw sensitive input"},
+                            "truncated": False,
+                        },
+                        "engine_selected_data": {"prompt": "raw sensitive input"},
+                        "engine_selected_data_preview": {
+                            "type": "dict",
+                            "value": {"prompt": "raw sensitive input"},
+                            "truncated": False,
+                        },
+                    },
+                )
+            ]
+        )
+        request = self._make_request()
+        control_lookup = {
+            1: self._make_control(
+                1,
+                "ctrl-1",
+                {
+                    "evaluator": {"name": "regex", "config": {"pattern": "test"}},
+                    "selector": {"path": "input"},
+                },
+            ).control
+        }
+
+        events = build_control_execution_events(
+            response,
+            request,
+            control_lookup,
+            "trace123",
+            "span456",
+            "test-agent",
+        )
+
+        assert len(events) == 1
+        assert "selected_data" not in events[0].metadata
+        assert "selected_data_preview" not in events[0].metadata
+        assert "engine_selected_data" not in events[0].metadata
+        assert "engine_selected_data_preview" not in events[0].metadata
+
     def test_composite_control_uses_representative_observability_identity(self):
         response = self._make_response(non_matches=[self._make_match(1, "ctrl-1", matched=False)])
         request = self._make_request()
diff --git a/sdks/python/tests/test_otel_sink.py b/sdks/python/tests/test_otel_sink.py
index 6f1c81fd..4d4aa451 100644
--- a/sdks/python/tests/test_otel_sink.py
+++ b/sdks/python/tests/test_otel_sink.py
@@ -39,7 +39,23 @@ def _make_event(**overrides: object) -> ControlExecutionEvent:
         evaluator_name="regex",
         selector_path="input",
         error_message=None,
-        metadata={"labels": ["security", "pii"], "threshold": 3, "nested": {"k": "v"}},
+        metadata={
+            "labels": ["security", "pii"],
+            "threshold": 3,
+            "nested": {"k": "v"},
+            "selected_data": {"prompt": "raw sensitive input"},
+            "selected_data_preview": {
+                "type": "dict",
+                "value": {"prompt": "raw sensitive input"},
+                "truncated": False,
+            },
+            "engine_selected_data": {"prompt": "raw sensitive input"},
+            "engine_selected_data_preview": {
+                "type": "dict",
+                "value": {"prompt": "raw sensitive input"},
+                "truncated": False,
+            },
+        },
     )
     return event.model_copy(update=overrides)
 
@@ -227,6 +243,10 @@ def test_control_event_to_otel_span_maps_event_fields() -> None:
     assert span.attributes["agent_control.matched"] is True
     assert span.attributes["agent_control.metadata.labels"] == ["security", "pii"]
     assert span.attributes["agent_control.metadata.nested"] == '{"k": "v"}'
+    assert "agent_control.metadata.selected_data" not in span.attributes
+    assert "agent_control.metadata.selected_data_preview" not in span.attributes
+    assert "agent_control.metadata.engine_selected_data" not in span.attributes
+    assert "agent_control.metadata.engine_selected_data_preview" not in span.attributes
     assert span.error_message == "blocked"
     assert span.end_time_unix_nano >= span.start_time_unix_nano