diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml new file mode 100644 index 000000000..51893f10f --- /dev/null +++ b/.github/workflows/harness-integration.yml @@ -0,0 +1,40 @@ +name: Harness Integration + +on: + push: + branches: [main] + pull_request: + paths: + - "src/agentex/lib/core/harness/**" + - "src/agentex/lib/adk/_modules/**" + - ".github/workflows/harness-integration.yml" + +jobs: + conformance: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + # Defer to scripts/test so the harness suite runs under the exact same + # invocation as the main CI test job: DEFER_PYDANTIC_BUILD=false and + # `uv run --isolated --all-packages --all-extras pytest`, across the + # min/max supported Python versions. Running `uv run pytest` directly + # would risk an all-extras-only dep passing locally but failing in CI. + - name: Conformance suite + run: ./scripts/test tests/lib/core/harness/ -v + + # Live integration matrix (harness x {sync, async, temporal}) is added per-harness + # in the migration plans. Placeholder job keeps the workflow valid until then. + live-matrix: + runs-on: ubuntu-latest + if: false # enabled once the first harness's test agents land + steps: + - run: echo "populated by migration PRs" # TODO(harness-migration): enable per-harness; see migration PRs 4-8 diff --git a/src/agentex/lib/core/harness/__init__.py b/src/agentex/lib/core/harness/__init__.py new file mode 100644 index 000000000..067751d63 --- /dev/null +++ b/src/agentex/lib/core/harness/__init__.py @@ -0,0 +1,30 @@ +"""Shared, harness-independent machinery for the unified harness surface. + +The Agentex StreamTaskMessage* stream is the single source of truth; this +package derives spans from it and delivers it (yield or auto-send), so every +harness tap gets streaming + tracing + turn usage uniformly. +""" + +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + SpanSignal, + TurnResult, + HarnessTurn, + StreamTaskMessage, +) +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter + +__all__ = [ + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", +] diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py new file mode 100644 index 000000000..2ecd6b583 --- /dev/null +++ b/src/agentex/lib/core/harness/auto_send.py @@ -0,0 +1,156 @@ +"""Auto-send delivery: canonical stream -> adk.streaming side effects + tracing.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator +from datetime import datetime + +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage, TurnResult, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.core.harness.span_derivation import SpanDeriver + +try: + from agentex.lib.utils.logging import make_logger + + logger = make_logger(__name__) +except Exception: # ddtrace may be absent in some envs; fall back to stdlib + import logging + + logger = logging.getLogger(__name__) + + +async def auto_send( + events: AsyncIterator[StreamTaskMessage], + task_id: str, + tracer: SpanTracer | None = None, + streaming: Any = None, + usage: TurnUsage | None = None, + created_at: datetime | None = None, +) -> TurnResult: + """Push the canonical stream to the task stream via adk.streaming. + + Opens a streaming context per message (keyed by index), streams deltas via + ctx.stream_update, and closes via ctx.close() on Done. Posts tool + request/response full messages by opening a context with the content and + closing it immediately (no deltas). Derives and traces spans from the same + stream. Returns the last text segment's text + usage. + + Index-keyed routing: each Start(index=i) opens a context stored in + ctx_map[i]; Delta(index=i) routes to ctx_map.get(i); Done(index=i) closes + and removes ctx_map[i]. Events with index is None are skipped. The finally + block closes all remaining open contexts. + + final_text last-segment semantics: a new Start(TextContent) resets + final_text_parts so that multi-step turns return the LAST text segment. + Full(TextContent) also overwrites final_text_parts (same semantics). + + AGX1-378: created_at is forwarded to every streaming_task_message_context + call so callers can back-date message timestamps. + + Mirrors the open/close/stream_update pattern from + src/agentex/lib/adk/_modules/_langgraph_async.py: + - context opened via streaming_task_message_context(...).__aenter__() + - context closed via ctx.close() (not __aexit__) + - deltas pushed as StreamTaskMessageDelta with parent_task_message set + from ctx.task_message + + For async + temporal agents (call from inside an activity). + """ + if streaming is None: + from agentex.lib import adk + + streaming = adk.streaming + + deriver = SpanDeriver() if tracer is not None else None + final_text_parts: list[str] = [] + ctx_map: dict[int, Any] = {} + + async def _close_all() -> None: + # Guard each close independently: a failure on one context (e.g. a + # backend hiccup during teardown) must not abandon the remaining open + # contexts, otherwise their task messages would never be finalized. + for ctx in list(ctx_map.values()): + try: + await ctx.close() + except Exception as exc: + logger.warning("[harness.auto_send] context close failed during teardown: %s", exc) + ctx_map.clear() + + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + + if isinstance(event, StreamTaskMessageStart): + if event.index is None: + continue + i = event.index + # Reset final_text_parts when a new text segment starts + if isinstance(event.content, TextContent): + final_text_parts = [] + ctx = streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + created_at=created_at, + ) + ctx_map[i] = await ctx.__aenter__() + + elif isinstance(event, StreamTaskMessageDelta): + if event.index is None: + continue + ctx = ctx_map.get(event.index) + if ctx is not None and event.delta is not None: + # Reconstruct the delta with parent_task_message set from + # the context's task_message (mirrors _langgraph_async.py + # lines 72-78 and 117-127). + delta_with_parent = StreamTaskMessageDelta( + parent_task_message=ctx.task_message, + delta=event.delta, + type="delta", + index=event.index, + ) + await ctx.stream_update(delta_with_parent) + if isinstance(event.delta, TextDelta) and event.delta.text_delta: + final_text_parts.append(event.delta.text_delta) + + elif isinstance(event, StreamTaskMessageDone): + if event.index is None: + continue + ctx = ctx_map.pop(event.index, None) + if ctx is not None: + await ctx.close() + + elif isinstance(event, StreamTaskMessageFull): + # Full messages: post the full message by opening a context + # with the content and closing it immediately (no deltas; + # StreamingTaskMessageContext.close() persists initial_content + # when the accumulator is empty). Use async with so the context + # is closed even if close() raises (__aexit__ delegates to + # close()). + # Full(TextContent) also resets final_text_parts for + # last-segment semantics. + if isinstance(event.content, TextContent): + final_text_parts = [event.content.content] + async with streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + created_at=created_at, + ): + pass + + finally: + await _close_all() + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) + + return TurnResult(final_text="".join(final_text_parts), usage=usage or TurnUsage()) diff --git a/src/agentex/lib/core/harness/emitter.py b/src/agentex/lib/core/harness/emitter.py new file mode 100644 index 000000000..5b56793bf --- /dev/null +++ b/src/agentex/lib/core/harness/emitter.py @@ -0,0 +1,80 @@ +"""UnifiedEmitter: the single facade agent authors use for either delivery mode.""" + +from __future__ import annotations + +from typing import AsyncGenerator +from datetime import datetime + +from agentex.lib.core.harness.types import TurnResult, HarnessTurn, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.auto_send import auto_send +from agentex.lib.core.harness.yield_delivery import yield_events + + +class UnifiedEmitter: + """Ties trace context + chosen delivery together. + + Tracing modes (the `tracer` arg): + - tracer=None (default): auto-construct a SpanTracer if `trace_id` is present. + - tracer=False: disable tracing entirely, regardless of `trace_id`. + - tracer=: use the supplied instance. + + `tracing` and `streaming` are injection escape-hatches for tests/advanced + use; leave them None in production so the real adk modules are used. + """ + + tracer: SpanTracer | None + + def __init__( + self, + task_id: str, + trace_id: str | None, + parent_span_id: str | None, + tracer: SpanTracer | bool | None = None, + tracing: object | None = None, + streaming: object | None = None, + ): + self.task_id = task_id + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self._streaming = streaming + if tracer is False: + self.tracer = None + elif isinstance(tracer, SpanTracer): + self.tracer = tracer + elif trace_id: + self.tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id=task_id, + tracing=tracing, + ) + else: + self.tracer = None + + async def yield_turn(self, turn: HarnessTurn) -> AsyncGenerator[StreamTaskMessage, None]: + """Sync HTTP ACP delivery: forward events, trace as side effect.""" + async for event in yield_events(turn.events, tracer=self.tracer): + yield event + + async def auto_send_turn(self, turn: HarnessTurn, created_at: datetime | None = None) -> TurnResult: + """Async/temporal delivery: push to the task stream, return TurnResult. + + Pass `created_at` (e.g. `workflow.now()` under Temporal) to stamp the + turn's messages with a deterministic timestamp; it is forwarded to the + streaming contexts. Default None preserves server-side timestamps. + """ + # `turn.usage()` is only valid AFTER `turn.events` is exhausted (the + # HarnessTurn single-pass contract: real turns populate usage while the + # stream is consumed). So drive delivery first, then read usage — do NOT + # pass `usage=turn.usage()` eagerly here (that would capture the empty + # default before the stream runs). + result = await auto_send( + turn.events, + task_id=self.task_id, + tracer=self.tracer, + streaming=self._streaming, + created_at=created_at, + ) + result.usage = turn.usage() + return result diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py new file mode 100644 index 000000000..cecb24bcc --- /dev/null +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -0,0 +1,154 @@ +"""Pure reducer: canonical StreamTaskMessage* stream -> span open/close signals. + +Has no dependency on adk; unit-testable in isolation. Delivery adapters feed it +every event and act on the returned signals. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal, StreamTaskMessage +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +@dataclass +class _ToolReqMeta: + tool_call_id: str + name: str + arguments: dict[str, object] + args_buf: str = "" # accumulated streamed argument fragments + + +class SpanDeriver: + """Stateful reducer over the canonical stream. + + Tool span: open on Done of a ToolRequestContent index; close on matching + ToolResponseContent by tool_call_id. Reasoning span: open on + Start(ReasoningContent); close on that index's Done. + + Deliberate contracts: + - A `Full(ToolResponseContent)` whose tool_call_id was never opened is + ignored (no CloseSpan emitted). + - A `Done` for an index that was never a tool_request/reasoning Start is + ignored (no signal emitted). + - Events with `index is None` are skipped entirely; without a stable index + they cannot be reliably paired, and aliasing them to a sentinel would + let unrelated None-indexed events cross-match. + - `flush()` closes anything still open as incomplete; unclosed tool spans + are emitted in the order they were opened. + """ + + def __init__(self) -> None: + self._tool_by_index: dict[int, _ToolReqMeta] = {} + self._reasoning_index_open: set[int] = set() + # insertion-ordered set of open tool_call_ids (dict keys preserve order) + self._open_tool_ids: dict[str, None] = {} + + def observe(self, event: StreamTaskMessage) -> list[SpanSignal]: + if isinstance(event, StreamTaskMessageStart): + return self._on_start(event) + if isinstance(event, StreamTaskMessageDelta): + return self._on_delta(event) + if isinstance(event, StreamTaskMessageFull): + return self._on_full(event) + if isinstance(event, StreamTaskMessageDone): + return self._on_done(event) + return [] + + def flush(self) -> list[SpanSignal]: + """Close anything still open at end of stream, marked incomplete.""" + signals: list[SpanSignal] = [] + for tcid in list(self._open_tool_ids): + signals.append(CloseSpan(key=tcid, output=None, is_complete=False)) + self._open_tool_ids.clear() + for idx in sorted(self._reasoning_index_open): + signals.append(CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=False)) + self._reasoning_index_open.clear() + return signals + + def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + content = event.content + if isinstance(content, ToolRequestContent): + self._tool_by_index[idx] = _ToolReqMeta( + tool_call_id=content.tool_call_id, + name=content.name, + arguments=dict(content.arguments or {}), + ) + return [] + if content.type == "reasoning": + self._reasoning_index_open.add(idx) + return [OpenSpan(key=f"reasoning:{idx}", kind="reasoning", name="reasoning", input={})] + return [] + + def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + delta = event.delta + if isinstance(delta, ToolRequestDelta): + meta = self._tool_by_index.get(idx) + if meta is not None and delta.arguments_delta: + meta.args_buf += delta.arguments_delta + return [] + + def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: + """Handle a Full event. + + A `Full(ToolRequestContent)` opens a tool span (keyed by tool_call_id) + if it is not already open; the matching `Full(ToolResponseContent)` + closes it. This handles harnesses (e.g. LangGraph) that emit tool calls + as a single Full rather than Start+Done. + """ + content = event.content + if isinstance(content, ToolRequestContent): + tcid = content.tool_call_id + if tcid not in self._open_tool_ids: + self._open_tool_ids[tcid] = None + args = dict(content.arguments or {}) + return [OpenSpan(key=tcid, kind="tool", name=content.name, input=args)] + return [] + if isinstance(content, ToolResponseContent): + tcid = content.tool_call_id + if tcid in self._open_tool_ids: + self._open_tool_ids.pop(tcid, None) + return [ + CloseSpan( + key=tcid, + output=content.content, + is_complete=True, + is_error=content.is_error, + ) + ] + return [] + + def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + meta = self._tool_by_index.pop(idx, None) + if meta is not None: + args = meta.arguments + if meta.args_buf: + try: + args = json.loads(meta.args_buf) + except json.JSONDecodeError: + args = {"_raw": meta.args_buf} + self._open_tool_ids[meta.tool_call_id] = None + return [OpenSpan(key=meta.tool_call_id, kind="tool", name=meta.name, input=args)] + if idx in self._reasoning_index_open: + self._reasoning_index_open.discard(idx) + return [CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=True)] + return [] diff --git a/src/agentex/lib/core/harness/tracer.py b/src/agentex/lib/core/harness/tracer.py new file mode 100644 index 000000000..4ca4d628b --- /dev/null +++ b/src/agentex/lib/core/harness/tracer.py @@ -0,0 +1,88 @@ +"""Adapter from SpanSignals to adk.tracing spans (best-effort, overridable).""" + +from __future__ import annotations + +from typing import Any + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal + +try: + from agentex.lib.utils.logging import make_logger + + logger = make_logger(__name__) +except Exception: # ddtrace may be absent in some envs; fall back to stdlib + import logging + + logger = logging.getLogger(__name__) + + +class SpanTracer: + """Opens/closes adk.tracing child spans in response to span signals. + + `tracing` defaults to the real `adk.tracing` module; inject a fake in tests + or a custom tracer to override. No-op when `trace_id` is falsy. Never raises. + + The real TracingModule.end_span does NOT accept an output kwarg — output is + recorded by mutating span.output before calling end_span, matching the pattern + used throughout the codebase (see _langgraph_tracing.py on_tool_end etc.). + + Span-lifecycle contract: the `_open` dict (span key -> span object) is scoped + to a single turn. Pairing is by `key`: + - A duplicate OpenSpan for a key already in `_open` silently replaces the + earlier span; the earlier span is then orphaned (never closed / leaked). + - A CloseSpan for an unknown key is a no-op. + - Unpaired opens accumulate in `_open` for the lifetime of the tracer; since + a tracer is expected to live for one turn, this is bounded and acceptable. + """ + + def __init__( + self, + trace_id: str | None, + parent_span_id: str | None, + tracing: Any = None, + task_id: str | None = None, + ): + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self.task_id = task_id + if tracing is None: + from agentex.lib import adk + + tracing = adk.tracing + self._tracing = tracing + self._open: dict[str, Any] = {} # span key -> span object + + async def handle(self, signal: SpanSignal) -> None: + if not self.trace_id: + return + try: + if isinstance(signal, OpenSpan): + span = await self._tracing.start_span( + trace_id=self.trace_id, + name=signal.name, + input=signal.input, + parent_id=self.parent_span_id, + task_id=self.task_id, + ) + if span is not None: + self._open[signal.key] = span + elif isinstance(signal, CloseSpan): + span = self._open.pop(signal.key, None) + if span is not None: + # Output is recorded by mutating span.output before end_span. + # The real TracingModule.end_span signature is: + # end_span(trace_id, span, start_to_close_timeout, heartbeat_timeout, retry_policy) + # It does not accept an output= kwarg. + span.output = signal.output + # Tool failure status (ToolResponseContent.is_error) is recorded + # on span.data when the harness reports one; Span has no dedicated + # error field. None means no status was reported, so leave data alone. + if signal.is_error is not None: + data = span.data if isinstance(span.data, dict) else {} + span.data = {**data, "is_error": signal.is_error} + await self._tracing.end_span( + trace_id=self.trace_id, + span=span, + ) + except Exception as exc: # best-effort: tracing never breaks delivery + logger.warning("[harness.tracer] span signal failed: %s", exc) diff --git a/src/agentex/lib/core/harness/types.py b/src/agentex/lib/core/harness/types.py new file mode 100644 index 000000000..b37dc1e51 --- /dev/null +++ b/src/agentex/lib/core/harness/types.py @@ -0,0 +1,93 @@ +"""Types for the unified harness surface.""" + +from __future__ import annotations + +from typing import Any, Union, Literal, Protocol, AsyncIterator, runtime_checkable +from dataclasses import field, dataclass + +from pydantic import BaseModel, ConfigDict + +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + +# The canonical stream element. Taps yield these; delivery adapters consume them. +StreamTaskMessage = Union[ + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +] + +SpanKind = Literal["tool", "reasoning", "subagent"] + + +@dataclass +class OpenSpan: + """Signal to open a child span. `key` pairs an open with its close.""" + + key: str + kind: SpanKind + name: str + input: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class CloseSpan: + """Signal to close the span previously opened with the same `key`.""" + + key: str + output: Any = None + is_complete: bool = True # False when closed by flush() without a result + is_error: bool | None = None # tool failure status; None when the harness reports no status + + +SpanSignal = Union[OpenSpan, CloseSpan] + + +class TurnUsage(BaseModel): + """Harness-independent turn usage/cost, attached to the turn span. + + Token field names align with agentex.lib.core.observability.llm_metrics. + """ + + model_config = ConfigDict(from_attributes=True, populate_by_name=True) + + model: str | None = None + input_tokens: int | None = None + output_tokens: int | None = None + cached_input_tokens: int | None = None + reasoning_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + duration_ms: int | None = None + num_llm_calls: int = 0 + num_tool_calls: int = 0 + num_reasoning_blocks: int = 0 + + +class TurnResult(BaseModel): + """Returned to the caller after a turn is delivered.""" + + model_config = ConfigDict(from_attributes=True, populate_by_name=True) + + final_text: str = "" + usage: TurnUsage = TurnUsage() + + +@runtime_checkable +class HarnessTurn(Protocol): + """A single harness turn: a canonical stream plus its normalized usage. + + Python async generators cannot cleanly return a value to their consumer, so + a tap exposes usage via `usage()` (valid only after `events` is exhausted) + rather than via StopAsyncIteration. + """ + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: ... + + def usage(self) -> TurnUsage: ... diff --git a/src/agentex/lib/core/harness/yield_delivery.py b/src/agentex/lib/core/harness/yield_delivery.py new file mode 100644 index 000000000..69b39f152 --- /dev/null +++ b/src/agentex/lib/core/harness/yield_delivery.py @@ -0,0 +1,31 @@ +"""Yield delivery: pass the canonical stream through, tracing as a side effect.""" + +from __future__ import annotations + +from typing import AsyncIterator, AsyncGenerator + +from agentex.lib.core.harness.types import StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.span_derivation import SpanDeriver + + +async def yield_events( + events: AsyncIterator[StreamTaskMessage], + tracer: SpanTracer | None = None, +) -> AsyncGenerator[StreamTaskMessage, None]: + """Forward each event to the caller; derive + trace spans as a side effect. + + For sync HTTP ACP agents that yield events back over the response. When + `tracer` is None, this is a pure passthrough. + """ + deriver = SpanDeriver() if tracer is not None else None + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + yield event + finally: + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) diff --git a/tests/lib/core/harness/__init__.py b/tests/lib/core/harness/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/core/harness/conformance/__init__.py b/tests/lib/core/harness/conformance/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/core/harness/conformance/runner.py b/tests/lib/core/harness/conformance/runner.py new file mode 100644 index 000000000..81a74860c --- /dev/null +++ b/tests/lib/core/harness/conformance/runner.py @@ -0,0 +1,48 @@ +"""Shared conformance engine: every harness tap registers fixtures here. + +A fixture is (name, list[StreamTaskMessage]). The runner asserts that span +derivation over the events is identical regardless of delivery channel, which is +the cross-channel guarantee from the spec. + +Registry shared-state hazard: `_REGISTRY` is process-global. Every `test_*.py` +module that calls `register()` at import time contributes to it, so a module +that parametrizes over `all_fixtures()` will see fixtures registered by ANY +other conformance module imported earlier in the same pytest process (collection +order is not guaranteed). To stay deterministic, each future harness conformance +module should register and parametrize over its OWN fixtures (e.g. keep a +module-local list it both registers and parametrizes), rather than relying on +cross-module global accumulation via `all_fixtures()`. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from agentex.lib.core.harness.types import SpanSignal, StreamTaskMessage +from agentex.lib.core.harness.span_derivation import SpanDeriver + + +@dataclass +class Fixture: + name: str + events: list[StreamTaskMessage] + + +_REGISTRY: list[Fixture] = [] + + +def register(fixture: Fixture) -> None: + _REGISTRY.append(fixture) + + +def all_fixtures() -> list[Fixture]: + return list(_REGISTRY) + + +def derive_all(events: list[StreamTaskMessage]) -> list[SpanSignal]: + d = SpanDeriver() + out: list[SpanSignal] = [] + for e in events: + out.extend(d.observe(e)) + out.extend(d.flush()) + return out diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py new file mode 100644 index 000000000..d9eec1c15 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -0,0 +1,43 @@ +import pytest + +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + +from .runner import Fixture, register, derive_all, all_fixtures + +register( + Fixture( + name="builtin-single-tool", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), + ], + ) +) + + +@pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name) +def test_span_derivation_is_deterministic(fixture): + """Exercises the cross-channel guarantee: yield and auto-send observe the + same event stream, so span derivation must be deterministic/idempotent.""" + # Deriving twice over the same events yields identical signals (the property + # that makes yield vs auto-send equivalent, since both observe the same stream). + assert derive_all(fixture.events) == derive_all(fixture.events) diff --git a/tests/lib/core/harness/test_auto_send.py b/tests/lib/core/harness/test_auto_send.py new file mode 100644 index 000000000..1948e9196 --- /dev/null +++ b/tests/lib/core/harness/test_auto_send.py @@ -0,0 +1,490 @@ +"""Tests for auto_send delivery adapter. + +The fake mirrors the real StreamingTaskMessageContext API exactly: +- streaming_task_message_context(...) returns a context object (synchronously) +- open the context via __aenter__ (returns self after creating the task message) +- stream deltas via ctx.stream_update(StreamTaskMessageDelta(...)) +- close via ctx.close() (NOT __aexit__) + +This mirrors _langgraph_async.py lines 62-78 and 100-127. +""" + +import types as _types +from datetime import datetime + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_delta import TextDelta +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.core.harness.auto_send import auto_send +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +class _FakeCtx: + """Mirrors StreamingTaskMessageContext: __aenter__ opens (returns self with task_message set), + close() closes. stream_update records the call. + + task_message is a real TaskMessage instance so that auto_send can use it + as parent_task_message in StreamTaskMessageDelta without Pydantic validation errors. + """ + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + # Real TaskMessage so StreamTaskMessageDelta(parent_task_message=...) passes validation + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + # __aexit__ delegates to close in the real impl; keep for safety + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + """Mirrors StreamingService: streaming_task_message_context returns a context object.""" + + def __init__(self): + self.sink = [] + self.recorded_created_at: list[datetime | None] = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + self.recorded_created_at.append(created_at) + return _FakeCtx(self.sink, ctype, initial_content) + + +async def _gen(events): + for e in events: + yield e + + +# --------------------------------------------------------------------------- +# Test 1: text streaming — open, stream deltas, close; return accumulated text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_text_and_returns_final_text(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hel"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="lo"), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "Hello" + + kinds = [s[0] for s in streaming.sink] + # A context was created for the text content + assert kinds[0] == "ctx" + # It was opened and closed + assert "open" in kinds + assert "close" in kinds + # Exactly two updates were streamed (one per delta) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + +# --------------------------------------------------------------------------- +# Test 2: tool_request Full + tool_response Full — each posts one full message +# (open context with the content, no deltas, close immediately) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_posts_full_tool_messages(): + streaming = _FakeStreaming() + events = [ + # Two Full events post two messages (open+close immediately, no deltas). + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="file.py", + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + # Each Full event opens and closes exactly one context. + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + content_types = [s[1] for s in ctx_events] + assert content_types == ["tool_request", "tool_response"] + + # Each context is opened and closed + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 2 + assert len(closes) == 2 + + # No stream_update calls (full messages have no deltas) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 0 + + +# --------------------------------------------------------------------------- +# Test 3: tracing — spans are derived and handed to the tracer +# --------------------------------------------------------------------------- + + +class _RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return _types.SimpleNamespace() + + async def end_span(self, *, trace_id, span): + self.ended.append(getattr(span, "output", None)) + + +@pytest.mark.asyncio +async def test_auto_send_derives_tool_spans_via_tracer(): + fake_tracing = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake_tracing) + streaming = _FakeStreaming() + + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="ok", + ), + ), + ] + + result = await auto_send(_gen(events), task_id="task1", tracer=tracer, streaming=streaming) + + assert result.final_text == "" + assert fake_tracing.started == ["Bash"] + assert fake_tracing.ended == ["ok"] + + +# --------------------------------------------------------------------------- +# Test 4: text followed by a tool Full — text context is closed before Full +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_closes_text_context_before_full_message(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hi"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c2", + name="read_file", + arguments={}, + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Hi" + + # Verify ordering: text ctx opens, updates, closes; then tool_request ctx opens, closes + event_sequence = [(s[0], s[1]) for s in streaming.sink] + text_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "text")) + text_close_idx = next(i for i, s in enumerate(event_sequence) if s == ("close", "text")) + tool_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "tool_request")) + assert text_open_idx < text_close_idx < tool_open_idx + + +# --------------------------------------------------------------------------- +# Test 5: midstream error — propagates AND the open context is closed (finally) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_open_context_closed_on_midstream_error(): + streaming = _FakeStreaming() + + async def _exploding_gen(): + yield StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ) + raise RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + await auto_send(_exploding_gen(), task_id="task1", tracer=None, streaming=streaming) + + # The text context that was opened mid-stream was closed by the finally block. + assert ("open", "text") in [(s[0], s[1]) for s in streaming.sink] + assert ("close", "text") in [(s[0], s[1]) for s in streaming.sink] + + +# --------------------------------------------------------------------------- +# Test 6: streamed tool_request delivered (AGX1-377 core) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_tool_request(): + """A Start(ToolRequestContent) MUST open a streaming context (AGX1-377).""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_tool", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta( + type="tool_request", + tool_call_id="c_tool", + name="Bash", + arguments_delta='{"cmd": "ls"}', + ), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 1 + assert ctx_events[0][1] == "tool_request" + + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 1 + assert len(closes) == 1 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 1 + + +# --------------------------------------------------------------------------- +# Test 7: interleaved indexes route correctly +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_interleaved_indexes_route_correctly(): + """Deltas must be routed to the correct index-keyed context.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="A"), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="B"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + + opens = [s for s in streaming.sink if s[0] == "open"] + assert len(opens) == 2 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + update_deltas = [s[1].delta for s in streaming.sink if s[0] == "update"] + text_deltas = [d.text_delta for d in update_deltas if isinstance(d, TextDelta)] + assert set(text_deltas) == {"A", "B"} + + +# --------------------------------------------------------------------------- +# Test 8: final_text returns last text segment for multi-step +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_final_text_last_segment(): + """final_text must be the LAST text segment, not accumulated across all turns.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="First"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="Second"), + ), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Second" + + +# --------------------------------------------------------------------------- +# Test 9: Full(TextContent) contributes to final_text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_full_text_content_sets_final_text(): + """A Full(TextContent) must contribute its text to final_text.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=TextContent(type="text", author="agent", content="hello"), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "hello" + + +# --------------------------------------------------------------------------- +# Test 10: created_at is forwarded to streaming context (AGX1-378) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_created_at_forwarded(): + """created_at must be forwarded to every streaming_task_message_context call.""" + streaming = _FakeStreaming() + dt = datetime(2025, 1, 15, 12, 0, 0) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_ts", + name="Bash", + arguments={}, + ), + ), + ] + await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming, created_at=dt) + + assert all(ts == dt for ts in streaming.recorded_created_at) diff --git a/tests/lib/core/harness/test_emitter.py b/tests/lib/core/harness/test_emitter.py new file mode 100644 index 000000000..df155ec44 --- /dev/null +++ b/tests/lib/core/harness/test_emitter.py @@ -0,0 +1,148 @@ +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + + +class _FakeTracing: + async def start_span(self, **kw): + return None + + async def end_span(self, **kw): + pass + + +class _FakeCtx: + """Minimal StreamingTaskMessageContext fake (see test_auto_send.py).""" + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +class _Turn: + def __init__(self, events_list, usage): + self._events_list = events_list + self._usage = usage + + @property + async def events(self): + for e in self._events_list: + yield e + + def usage(self): + return self._usage + + +@pytest.mark.asyncio +async def test_emitter_yield_mode_passes_through(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, TurnUsage(model="m")) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +@pytest.mark.asyncio +async def test_emitter_tracing_default_on_when_trace_id_present(): + # Inject a fake tracing backend so the test env doesn't need temporalio. + # This exercises the default-on path (tracer=None) when trace_id is truthy. + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracing=_FakeTracing()) + assert emitter.tracer is not None + + +@pytest.mark.asyncio +async def test_emitter_tracing_overridable_off(): + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracer=False) + assert emitter.tracer is None + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_returns_usage(): + usage = TurnUsage(model="m", input_tokens=5) + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hello")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, usage) + fake = _FakeStreaming() + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=fake) + result = await emitter.auto_send_turn(turn) + assert result.usage == usage + assert result.final_text == "Hello" + + +class _ContractTurn: + """A turn that honors the single-pass contract: usage() is the empty default + UNTIL `events` is exhausted, then the real usage (this is how real harness + turns behave — they populate usage while the stream is consumed).""" + + def __init__(self, events_list, real_usage): + self._events_list = events_list + self._real_usage = real_usage + self._exhausted = False + + @property + async def events(self): + for e in self._events_list: + yield e + self._exhausted = True + + def usage(self): + return self._real_usage if self._exhausted else TurnUsage(model="m") + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_reads_usage_after_exhaustion(): + # Regression: auto_send_turn must read turn.usage() AFTER consuming the + # stream, not eagerly when building the auto_send call (which would capture + # the empty default and lose real token usage on the auto_send path). + real_usage = TurnUsage(model="m", input_tokens=11, output_tokens=22, total_tokens=33, num_llm_calls=2) + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _ContractTurn(events, real_usage) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=_FakeStreaming()) + result = await emitter.auto_send_turn(turn) + assert result.usage == real_usage + assert result.usage.input_tokens == 11 and result.usage.total_tokens == 33 diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py new file mode 100644 index 000000000..51e2ede2c --- /dev/null +++ b/tests/lib/core/harness/test_span_derivation.py @@ -0,0 +1,286 @@ +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.span_derivation import SpanDeriver + + +def _signals(deriver, events): + out = [] + for e in events: + out.extend(deriver.observe(e)) + out.extend(deriver.flush()) + return out + + +def _tool_req(idx, tcid, name, args): + return StreamTaskMessageStart( + type="start", + index=idx, + content=ToolRequestContent(type="tool_request", author="agent", tool_call_id=tcid, name=name, arguments=args), + ) + + +def test_text_only_yields_no_spans(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=None), + StreamTaskMessageDone(type="done", index=0), + ] + assert _signals(d, events) == [] + + +def test_single_tool_opens_on_done_closes_on_response(): + d = SpanDeriver() + events = [ + _tool_req(0, "call_1", "Bash", {"cmd": "ls"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="call_1", name="Bash", content="files" + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}), + CloseSpan(key="call_1", output="files", is_complete=True), + ] + # No status reported -> CloseSpan carries is_error=None. + assert sigs[1].is_error is None + + +def test_tool_response_is_error_propagates_to_close_span(): + """ToolResponseContent.is_error flows onto the CloseSpan so a derived tool + span can be marked as a failure (AGX1-371).""" + d = SpanDeriver() + events = [ + _tool_req(0, "call_err", "Bash", {"cmd": "false"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_err", + name="Bash", + content="boom", + is_error=True, + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_err", kind="tool", name="Bash", input={"cmd": "false"}), + CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True), + ] + + +def test_reasoning_opens_on_start_closes_on_done(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", index=0, content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[]) + ), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="reasoning:0", kind="reasoning", name="reasoning", input={}) + assert sigs[1] == CloseSpan(key="reasoning:0", output=None, is_complete=True) + + +def test_parallel_tools_pair_by_tool_call_id(): + d = SpanDeriver() + events = [ + _tool_req(0, "a", "T1", {}), + _tool_req(1, "b", "T2", {}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="b", name="T2", content="rb" + ), + ), + StreamTaskMessageFull( + type="full", + index=3, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="a", name="T1", content="ra" + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + closes = [s for s in sigs if isinstance(s, CloseSpan)] + assert {o.key for o in opens} == {"a", "b"} + assert [c.key for c in closes] == ["b", "a"] + assert all(c.is_complete for c in closes) + + +def test_streamed_args_accumulate_into_open_input(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='{"cmd":'), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='"ls"}'), + ), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="c", kind="tool", name="Bash", input={"cmd": "ls"}) + + +def test_unclosed_tool_closed_incomplete_on_flush(): + d = SpanDeriver() + events = [ + _tool_req(0, "x", "Bash", {}), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="x", kind="tool", name="Bash", input={}) + assert sigs[1] == CloseSpan(key="x", output=None, is_complete=False) + + +def test_none_index_is_skipped(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=None, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="n", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=None), + ] + assert _signals(d, events) == [] + + +def test_orphan_tool_response_ignored(): + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="z", name="Bash", content="r" + ), + ), + ] + assert _signals(d, events) == [] + + +def test_full_tool_request_opens_span(): + """Full(ToolRequestContent) must open a tool span (for LangGraph-style harnesses).""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_x", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="call_x", kind="tool", name="Bash", input={"cmd": "ls"}) + assert sigs[1] == CloseSpan(key="call_x", output=None, is_complete=False) + + +def test_full_tool_request_and_response_paired(): + """Full(ToolRequestContent) + Full(ToolResponseContent) produces a complete span pair.""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_y", + name="Grep", + arguments={}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_y", + name="Grep", + content="result", + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_y", kind="tool", name="Grep", input={}), + CloseSpan(key="call_y", output="result", is_complete=True), + ] + + +def test_full_tool_request_does_not_double_open(): + """A Full(ToolRequestContent) for an already-open tool_call_id is a no-op.""" + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + assert len(opens) == 1 + assert opens[0].key == "call_z" diff --git a/tests/lib/core/harness/test_tracer.py b/tests/lib/core/harness/test_tracer.py new file mode 100644 index 000000000..ed40cf595 --- /dev/null +++ b/tests/lib/core/harness/test_tracer.py @@ -0,0 +1,93 @@ +from typing import override + +import pytest + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.lib.core.harness.tracer import SpanTracer + + +class _FakeSpan: + def __init__(self, name): + self.name = name + self.output = None + self.data = None + + +class _FakeTracing: + def __init__(self): + self.started = [] + self.ended = [] + self.ended_spans = [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append((name, parent_id, input)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id, span): + self.ended.append((span.name, span.output)) + self.ended_spans.append(span) + + +@pytest.mark.asyncio +async def test_open_then_close_starts_and_ends_span(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.started == [("Bash", "p1", {"cmd": "ls"})] + assert fake.ended == [("Bash", "files")] + + +@pytest.mark.asyncio +async def test_close_records_is_error_on_span_data(): + """A CloseSpan carrying is_error records the status on span.data (AGX1-371).""" + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_err", kind="tool", name="Bash", input={})) + await tracer.handle(CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True)) + assert fake.ended_spans[0].data == {"is_error": True} + + +@pytest.mark.asyncio +async def test_close_without_status_leaves_span_data_untouched(): + """is_error=None (no status reported) must not write to span.data.""" + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.ended_spans[0].data is None + + +@pytest.mark.asyncio +async def test_no_trace_id_is_noop(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="", parent_span_id=None, tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert fake.started == [] and fake.ended == [] + + +@pytest.mark.asyncio +async def test_tracing_failure_is_swallowed(): + class _Boom(_FakeTracing): + @override + async def start_span(self, **kw): + raise RuntimeError("backend down") + + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=_Boom()) + # Must not raise. + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert tracer._open == {} + + +@pytest.mark.asyncio +async def test_duplicate_open_replaces_silently(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="A")) + await tracer.handle(OpenSpan(key="k", kind="tool", name="B")) + await tracer.handle(CloseSpan(key="k")) + # Both opens started spans, but only the second ("B") is closed. + assert [name for name, _, _ in fake.started] == ["A", "B"] + assert fake.ended == [("B", None)] diff --git a/tests/lib/core/harness/test_types.py b/tests/lib/core/harness/test_types.py new file mode 100644 index 000000000..68bc89ce2 --- /dev/null +++ b/tests/lib/core/harness/test_types.py @@ -0,0 +1,53 @@ +from typing import AsyncIterator + +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + TurnResult, + HarnessTurn, + StreamTaskMessage, +) + + +def test_open_close_span_construct(): + o = OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}) + c = CloseSpan(key="call_1", output="files", is_complete=True) + assert o.key == c.key == "call_1" + assert o.kind == "tool" + assert c.is_complete is True + + +def test_turn_usage_defaults_are_none(): + u = TurnUsage(model="claude-opus-4-6") + assert u.model == "claude-opus-4-6" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +def test_turn_result_wraps_usage(): + r = TurnResult(final_text="hi", usage=TurnUsage(model="m")) + assert r.final_text == "hi" + assert r.usage.model == "m" + + +def test_close_span_defaults(): + c = CloseSpan(key="x") + assert c.output is None + assert c.is_complete is True + + +def test_harness_turn_runtime_check(): + class _Turn: + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + async def _gen() -> AsyncIterator[StreamTaskMessage]: + if False: + yield # pragma: no cover + + return _gen() + + def usage(self) -> TurnUsage: + return TurnUsage(model="m") + + assert isinstance(_Turn(), HarnessTurn) is True diff --git a/tests/lib/core/harness/test_yield_delivery.py b/tests/lib/core/harness/test_yield_delivery.py new file mode 100644 index 000000000..f3f491d84 --- /dev/null +++ b/tests/lib/core/harness/test_yield_delivery.py @@ -0,0 +1,89 @@ +import types as _types + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.yield_delivery import yield_events + + +class _RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return _types.SimpleNamespace() # supports arbitrary attribute assignment (span.output = ...) + + async def end_span(self, *, trace_id, span): + self.ended.append(getattr(span, "output", None)) + + +async def _gen(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_passes_events_through_and_traces(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), + ] + out = [e async for e in yield_events(_gen(events), tracer=tracer)] + assert out == events # passthrough unchanged + assert fake.started == ["Bash"] # span derived + opened + assert fake.ended == ["ok"] # span closed with response + + +@pytest.mark.asyncio +async def test_yield_without_tracer_is_pure_passthrough(): + events = [ + StreamTaskMessageDone(type="done", index=0), + ] + out = [e async for e in yield_events(_gen(events), tracer=None)] + assert out == events + + +@pytest.mark.asyncio +async def test_flush_runs_on_early_close(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + # response intentionally never arrives + ] + gen = yield_events(_gen(events), tracer=tracer) + first = await gen.__anext__() # Start + second = await gen.__anext__() # Done -> tool span opens here + await gen.aclose() # triggers the finally -> flush() + assert fake.started == ["Bash"] + assert fake.ended == [None] # flush closed the unpaired span (incomplete, no output)